[SPARK-50812][ML][PYTHON][CONNECT] Add support for PolynomialExpansion

wbo4958 · zhengruifeng · commit aa24a9a235b1 · 2025-01-28T12:15:08.000+08:00
### What changes were proposed in this pull request? Support PolynomialExpansion on Spark Connect. ### Why are the changes needed? Feature parity with classic Spark ML. ### Does this PR introduce _any_ user-facing change? Yes, PolynomialExpansion can now be used on Spark Connect. ### How was this patch tested? Added `test_polynomial_expansion` to python/pyspark/ml/tests/test_feature.py; CI passes. ### Was this patch authored or co-authored using generative AI tooling? No Closes #49702 from wbo4958/px. Authored-by: Bobby Wang <wbo4958@gmail.com> Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
@@ -36,6 +36,7 @@ org.apache.spark.ml.feature.FeatureHasher
 org.apache.spark.ml.feature.ElementwiseProduct
 org.apache.spark.ml.feature.HashingTF
 org.apache.spark.ml.feature.IndexToString
+org.apache.spark.ml.feature.PolynomialExpansion
 
 ########### Model for loading
 # classification
diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py
@@ -77,6 +77,7 @@
     MinHashLSH,
     MinHashLSHModel,
     IndexToString,
+    PolynomialExpansion,
 )
 from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
 from pyspark.sql import Row
@@ -85,6 +86,31 @@
 
 
 class FeatureTestsMixin:
+    def test_polynomial_expansion(self):
+        df = self.spark.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"])
+        px = PolynomialExpansion(degree=2)
+        px.setInputCol("dense")
+        px.setOutputCol("expanded")
+        self.assertTrue(  # expected terms: x, x*x, y, x*y, y*y for x=0.5, y=2.0
+            np.allclose(
+                px.transform(df).head().expanded.toArray(), [0.5, 0.25, 2.0, 1.0, 4.0], atol=1e-4
+            )
+        )
+        # Param assertions shared by the original and the reloaded transformer.
+        def check(p: PolynomialExpansion) -> None:
+            self.assertEqual(p.getInputCol(), "dense")
+            self.assertEqual(p.getOutputCol(), "expanded")
+            self.assertEqual(p.getDegree(), 2)
+
+        check(px)
+
+        # save & load: persist to a temp dir, reload, and compare against the original
+        with tempfile.TemporaryDirectory(prefix="px") as d:
+            px.write().overwrite().save(d)
+            px2 = PolynomialExpansion.load(d)
+            self.assertEqual(str(px), str(px2))
+            check(px2)
+
     def test_index_string(self):
         dataset = self.spark.createDataFrame(
             [