Commit cdd5296
[SPARK-51856][ML][CONNECT] Update model size API to count distributed DataFrame size
### What changes were proposed in this pull request?

Update model size API to count distributed DataFrame size

### Why are the changes needed?

For Spark server ML cache management.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #50652 from WeichenXu123/get-model-ser-size-api.

Lead-authored-by: Weichen Xu <weichen.xu@databricks.com>
Co-authored-by: WeichenXu <weichen.xu@databricks.com>
Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
1 parent 772c4cb commit cdd5296

File tree

8 files changed: +50 -5 lines changed

mllib/src/main/scala/org/apache/spark/ml/Estimator.scala

Lines changed: 2 additions & 2 deletions
@@ -87,8 +87,8 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage {
    * Estimate an upper-bound size of the model to be fitted in bytes, based on the
    * parameters and the dataset, e.g., using $(k) and numFeatures to estimate a
    * k-means model size.
-   * 1, Only driver side memory usage is counted, distributed objects (like DataFrame,
-   * RDD, Graph, Summary) are ignored.
+   * 1, Both driver side memory usage and distributed objects size (like DataFrame,
+   * RDD, Graph, Summary) are counted.
    * 2, Lazy vals are not counted, e.g., an auxiliary object used in prediction.
    * 3, If there is no enough information to get an accurate size, try to estimate the
    * upper-bound size, e.g.
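As a concrete reading of the k-means example mentioned in this doc comment, an upper bound can come straight from the parameters. A minimal sketch, with an illustrative function name that is not part of this commit:

    // Hypothetical upper bound for a k-means model: k cluster centers,
    // each a dense vector of numFeatures doubles (8 bytes per element).
    def kmeansUpperBoundBytes(k: Int, numFeatures: Long): Long =
      k.toLong * numFeatures * 8L

    kmeansUpperBoundBytes(k = 10, numFeatures = 100) // 8000 bytes of centers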

mllib/src/main/scala/org/apache/spark/ml/Model.scala

Lines changed: 2 additions & 2 deletions
@@ -49,8 +49,8 @@ abstract class Model[M <: Model[M]] extends Transformer { self =>
    * For ml connect only.
    * Estimate the size of this model in bytes.
    * This is an approximation, the real size might be different.
-   * 1, Only driver side memory usage is counted, distributed objects (like DataFrame,
-   * RDD, Graph, Summary) are ignored.
+   * 1, Both driver side memory usage and distributed objects size (like DataFrame,
+   * RDD, Graph, Summary) are counted.
    * 2, Lazy vals are not counted, e.g., an auxiliary object used in prediction.
    * 3, The default implementation uses `org.apache.spark.util.SizeEstimator.estimate`,
    * some models override the default implementation to achieve more precise estimation.
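For context on point 3: `org.apache.spark.util.SizeEstimator.estimate` walks a driver-side object graph and returns an approximate deep size in bytes, so distributed data is invisible to it, which is why DataFrame-backed models need explicit overrides. A minimal sketch of the default behavior (the coefficient class here is hypothetical):

    import org.apache.spark.util.SizeEstimator

    // Hypothetical driver-side model state: 1000 double coefficients.
    case class DenseCoefficients(values: Array[Double])

    val coeffs = DenseCoefficients(Array.fill(1000)(0.0))
    // Approximate deep size in bytes of the driver-side object graph.
    val bytes: Long = SizeEstimator.estimate(coeffs)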

mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala

Lines changed: 5 additions & 0 deletions
@@ -805,6 +805,11 @@ class DistributedLDAModel private[ml] (
   override def toString: String = {
     s"DistributedLDAModel: uid=$uid, k=${$(k)}, numFeatures=$vocabSize"
   }
+
+  override def estimatedSize: Long = {
+    // TODO: Implement this method.
+    throw new UnsupportedOperationException
+  }
 }

mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala

Lines changed: 5 additions & 0 deletions
@@ -322,6 +322,11 @@ class FPGrowthModel private[ml] (
   override def toString: String = {
     s"FPGrowthModel: uid=$uid, numTrainingRecords=$numTrainingRecords"
   }
+
+  override def estimatedSize: Long = {
+    // TODO: Implement this method.
+    throw new UnsupportedOperationException
+  }
 }

 @Since("2.2.0")

mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala

Lines changed: 12 additions & 0 deletions
@@ -540,6 +540,11 @@ class ALSModel private[ml] (
     }
   }

+  override def estimatedSize: Long = {
+    val userCount = userFactors.count()
+    val itemCount = itemFactors.count()
+    (userCount + itemCount) * (rank + 1) * 4
+  }
 }

 @Since("1.6.0")
@@ -771,6 +776,13 @@ class ALS(@Since("1.4.0") override val uid: String) extends Estimator[ALSModel]

   @Since("1.5.0")
   override def copy(extra: ParamMap): ALS = defaultCopy(extra)
+
+  override def estimateModelSize(dataset: Dataset[_]): Long = {
+    val userCount = dataset.select(getUserCol).distinct().count()
+    val itemCount = dataset.select(getItemCol).distinct().count()
+    val rank = getRank
+    (userCount + itemCount) * (rank + 1) * 4
+  }
 }
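Both overrides implement the same arithmetic: an ALS factor row pairs an Int id (4 bytes) with `rank` single-precision values (4 bytes each), so one row is roughly (rank + 1) * 4 bytes, summed over distinct users and items. A standalone sketch of the formula, not code from the commit:

    // One factor row: an Int id (4 bytes) plus `rank` Floats (4 bytes each),
    // summed over the distinct users and items in the factor DataFrames.
    def alsFactorBytes(userCount: Long, itemCount: Long, rank: Int): Long =
      (userCount + itemCount) * (rank + 1) * 4L

    alsFactorBytes(userCount = 3, itemCount = 2, rank = 8) // (3 + 2) * 9 * 4 = 180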

mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala

Lines changed: 18 additions & 0 deletions
@@ -1128,6 +1128,24 @@ class ALSStorageSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
     levels.foreach(level => assert(level == StorageLevel.MEMORY_ONLY))
     nonDefaultListener.storageLevels.foreach(level => assert(level == StorageLevel.DISK_ONLY))
   }
+
+  test("saved model size estimation") {
+    import testImplicits._
+
+    val als = new ALS().setMaxIter(1).setRank(8)
+    val estimatedDFSize = (3 + 2) * (8 + 1) * 4
+    val df = sc.parallelize(Seq(
+      (123, 1, 0.5),
+      (123, 2, 0.7),
+      (123, 3, 0.6),
+      (111, 2, 1.0),
+      (111, 1, 0.1)
+    )).toDF("item", "user", "rating")
+    assert(als.estimateModelSize(df) === estimatedDFSize)
+
+    val model = als.fit(df)
+    assert(model.estimatedSize == estimatedDFSize)
+  }
 }

 private class IntermediateRDDStorageListener extends SparkListener {
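For reference, the expected value follows the ALS formula above: the toy DataFrame has 3 distinct users (1, 2, 3) and 2 distinct items (123, 111) at rank 8, so (3 + 2) * (8 + 1) * 4 = 180 bytes, and the test checks that the pre-fit estimate and the fitted model's `estimatedSize` agree on it.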

python/pyspark/ml/tests/test_clustering.py

Lines changed: 3 additions & 0 deletions
@@ -37,6 +37,7 @@
     DistributedLDAModel,
     PowerIterationClustering,
 )
+from pyspark.sql import is_remote
 from pyspark.testing.sqlutils import ReusedSQLTestCase


@@ -377,6 +378,8 @@ def test_local_lda(self):
         self.assertEqual(str(model), str(model2))

     def test_distributed_lda(self):
+        if is_remote():
+            self.skipTest("Do not support Spark Connect.")
         spark = self.spark
         df = (
             spark.createDataFrame(

python/pyspark/ml/tests/test_fpm.py

Lines changed: 3 additions & 1 deletion
@@ -18,7 +18,7 @@
 import tempfile
 import unittest

-from pyspark.sql import Row
+from pyspark.sql import is_remote, Row
 import pyspark.sql.functions as sf
 from pyspark.ml.fpm import (
     FPGrowth,
@@ -30,6 +30,8 @@

 class FPMTestsMixin:
     def test_fp_growth(self):
+        if is_remote():
+            self.skipTest("Do not support Spark Connect.")
         df = self.spark.createDataFrame(
             [
                 ["r z h k p"],
