Commit 26545d0

[SPARK-51394][ML] Optimize out the additional shuffle in stats tests
### What changes were proposed in this pull request?
Optimize out the additional shuffle in the stats tests.

### Why are the changes needed?
For simplification: the single summary row can be built without an extra shuffle.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #50166 from zhengruifeng/ml_cst.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
1 parent c005a37 commit 26545d0
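
For context, the sketch below is a minimal standalone illustration (not the Spark ML sources themselves) of the pattern this commit applies in all three test helpers: the per-feature results used to be gathered with a collect_list aggregation, which forces a shuffle into a single group, and are now folded into one summary row with coalesce(1) + mapPartitions, which needs no exchange. The object name, the toy perFeature data, and the single pValues column are illustrative assumptions.

// Illustrative sketch only; names and data are made up for the example.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{collect_list, struct}

object ShufflePatternSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    import spark.implicits._

    // Toy per-feature statistics: (featureIndex, pValue), spread over 3 partitions.
    val perFeature = spark.sparkContext.parallelize(Seq((0, 0.01), (1, 0.20), (2, 0.05)), 3)

    // Old shape: aggregate every row into one group via collect_list, which shuffles.
    val viaCollectList = perFeature.toDF("featureIndex", "pValue")
      .agg(collect_list(struct("*")))
      .as[Seq[(Int, Double)]]
      .map(seq => seq.toArray.sortBy(_._1).map(_._2))
      .toDF("pValues")

    // New shape: the per-feature results are tiny, so coalesce them into one partition
    // (no shuffle) and build the single summary row locally inside mapPartitions.
    // The sortBy restores featureIndex order, since partition order is not guaranteed.
    val viaCoalesce = perFeature.coalesce(1)
      .mapPartitions(iter => Iterator.single(iter.toArray.sortBy(_._1).map(_._2)))
      .toDF("pValues")

    // Both yield one row; comparing .explain() output shows the exchange disappear.
    viaCollectList.show(truncate = false)
    viaCoalesce.show(truncate = false)
    spark.stop()
  }
}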

3 files changed, +29 -31 lines


mllib/src/main/scala/org/apache/spark/ml/stat/ANOVATest.scala

Lines changed: 9 additions & 11 deletions
@@ -69,20 +69,18 @@ private[ml] object ANOVATest {
     val spark = dataset.sparkSession
     import spark.implicits._
 
-    val resultDF = testClassification(dataset, featuresCol, labelCol)
-      .toDF("featureIndex", "pValue", "degreesOfFreedom", "fValue")
+    val resRdd = testClassification(dataset, featuresCol, labelCol)
 
     if (flatten) {
-      resultDF
+      resRdd.toDF("featureIndex", "pValue", "degreesOfFreedom", "fValue")
     } else {
-      resultDF.agg(collect_list(struct("*")))
-        .as[Seq[(Int, Double, Long, Double)]]
-        .map { seq =>
-          val results = seq.toArray.sortBy(_._1)
-          val pValues = Vectors.dense(results.map(_._2))
-          val degreesOfFreedom = results.map(_._3)
-          val fValues = Vectors.dense(results.map(_._4))
-          (pValues, degreesOfFreedom, fValues)
+      resRdd.coalesce(1)
+        .mapPartitions { iter =>
+          val res = iter.toArray.sortBy(_._1)
+          val pValues = Vectors.dense(res.map(_._2))
+          val degreesOfFreedom = res.map(_._3)
+          val fValues = Vectors.dense(res.map(_._4))
+          Iterator.single((pValues, degreesOfFreedom, fValues))
         }.toDF("pValues", "degreesOfFreedom", "fValues")
     }
   }

mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquareTest.scala

Lines changed: 10 additions & 11 deletions
@@ -80,22 +80,21 @@ object ChiSquareTest {
     val data = dataset.select(col(labelCol).cast("double"), col(featuresCol)).rdd
       .map { case Row(label: Double, vec: Vector) => (label, OldVectors.fromML(vec)) }
 
-    val resultDF = OldChiSqTest.computeChiSquared(data)
+    val resRDD = OldChiSqTest.computeChiSquared(data)
       .map { case (col, pValue, degreesOfFreedom, statistic, _) =>
         (col, pValue, degreesOfFreedom, statistic)
-      }.toDF("featureIndex", "pValue", "degreesOfFreedom", "statistic")
+      }
 
     if (flatten) {
-      resultDF
+      resRDD.toDF("featureIndex", "pValue", "degreesOfFreedom", "statistic")
     } else {
-      resultDF.agg(collect_list(struct("*")))
-        .as[Seq[(Int, Double, Int, Double)]]
-        .map { seq =>
-          val results = seq.toArray.sortBy(_._1)
-          val pValues = Vectors.dense(results.map(_._2))
-          val degreesOfFreedom = results.map(_._3)
-          val statistics = Vectors.dense(results.map(_._4))
-          (pValues, degreesOfFreedom, statistics)
+      resRDD.coalesce(1)
+        .mapPartitions { iter =>
+          val res = iter.toArray.sortBy(_._1)
+          val pValues = Vectors.dense(res.map(_._2))
+          val degreesOfFreedom = res.map(_._3)
+          val statistics = Vectors.dense(res.map(_._4))
+          Iterator.single((pValues, degreesOfFreedom, statistics))
         }.toDF("pValues", "degreesOfFreedom", "statistics")
     }
   }

mllib/src/main/scala/org/apache/spark/ml/stat/FValueTest.scala

Lines changed: 10 additions & 9 deletions
@@ -70,20 +70,21 @@ private[ml] object FValueTest {
     val spark = dataset.sparkSession
     import spark.implicits._
 
+    val resRDD = testRegression(dataset, featuresCol, labelCol)
+
     val resultDF = testRegression(dataset, featuresCol, labelCol)
       .toDF("featureIndex", "pValue", "degreesOfFreedom", "fValue")
 
     if (flatten) {
-      resultDF
+      resRDD.toDF("featureIndex", "pValue", "degreesOfFreedom", "fValue")
     } else {
-      resultDF.agg(collect_list(struct("*")))
-        .as[Seq[(Int, Double, Long, Double)]]
-        .map { seq =>
-          val results = seq.toArray.sortBy(_._1)
-          val pValues = Vectors.dense(results.map(_._2))
-          val degreesOfFreedom = results.map(_._3)
-          val fValues = Vectors.dense(results.map(_._4))
-          (pValues, degreesOfFreedom, fValues)
+      resRDD.coalesce(1)
+        .mapPartitions { iter =>
+          val res = iter.toArray.sortBy(_._1)
+          val pValues = Vectors.dense(res.map(_._2))
+          val degreesOfFreedom = res.map(_._3)
+          val fValues = Vectors.dense(res.map(_._4))
+          Iterator.single((pValues, degreesOfFreedom, fValues))
         }.toDF("pValues", "degreesOfFreedom", "fValues")
     }
   }
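
Of the three helpers, only ChiSquareTest is a public API (ANOVATest and FValueTest are private[ml] and reached through the corresponding feature selectors). Below is a small usage sketch, assuming the four-argument overload with the flatten parameter that Spark has exposed since 3.1; the tiny DataFrame and object name are made up for illustration, and the flatten = false call exercises the branch rewritten above.

// Usage sketch; the toy data is illustrative, not taken from the Spark test suite.
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.stat.ChiSquareTest
import org.apache.spark.sql.SparkSession

object ChiSquareTestUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("chi2-usage").getOrCreate()

    val data = Seq(
      (0.0, Vectors.dense(0.5, 10.0)),
      (0.0, Vectors.dense(1.5, 20.0)),
      (1.0, Vectors.dense(1.5, 30.0)),
      (0.0, Vectors.dense(3.5, 30.0)),
      (1.0, Vectors.dense(3.5, 40.0)))
    val df = spark.createDataFrame(data).toDF("label", "features")

    // flatten = true: one row per feature (featureIndex, pValue, degreesOfFreedom, statistic).
    ChiSquareTest.test(df, "features", "label", true).show()

    // flatten = false: a single summary row with vector-valued columns
    // (pValues, degreesOfFreedom, statistics) -- the branch rewritten in this commit.
    ChiSquareTest.test(df, "features", "label", false).show()

    spark.stop()
  }
}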
