Revert "[SPARK-47895][SQL] group by alias should be idempotent"

mihailotim-db · cloud-fan · commit 8718eba1f6f7 · 2025-04-14T12:16:37.000+08:00
### What changes were proposed in this pull request? This PR reverts #50461 because it introduces a correctness issue by replacing an `Alias` with an incorrect literal. A followup will be made with a different way to fix this issue. ### Why are the changes needed? In the example below, alias `abc` is replaced with `2`, resulting in 2 rows instead of the correct 3. Before the erroneous PR: ![image](https://github.com/user-attachments/assets/dcc98323-369d-4f5e-b0ad-de5b76ffc5c3) After the erroneous PR: ![image](https://github.com/user-attachments/assets/31c24125-6654-48f8-9b55-40a6a667ed23) ### Does this PR introduce _any_ user-facing change? User now sees an error message instead of an incorrect result. ### How was this patch tested? Added a test case to check for this behavior in the future. ### Was this patch authored or co-authored using generative AI tooling? No Closes #50567 from mihailotim-db/mihailotim-db/revert_group_by. Authored-by: Mihailo Timotic <mihailo.timotic@databricks.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
@@ -116,19 +116,7 @@ class ResolveReferencesInAggregate(val catalogManager: CatalogManager) extends S
       groupExprs.map { g =>
         g.transformWithPruning(_.containsPattern(UNRESOLVED_ATTRIBUTE)) {
           case u: UnresolvedAttribute =>
-            val (result, index) =
-              selectList.zipWithIndex.find(ne => conf.resolver(ne._1.name, u.name))
-                .getOrElse((u, -1))
-
-            trimAliases(result) match {
-              // HACK ALERT: If the expanded grouping expression is an integer literal, don't use it
-              //             but use an integer literal of the index. The reason is we may
-              //             repeatedly analyze the plan, and the original integer literal may cause
-              //             failures with a later GROUP BY ordinal resolution. GROUP BY constant is
-              //             meaningless so whatever value does not matter here.
-              case IntegerLiteral(_) => Literal(index + 1)
-              case _ => result
-            }
+            selectList.find(ne => conf.resolver(ne.name, u.name)).getOrElse(u)
         }
       }
     } else {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
@@ -104,22 +104,4 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest {
       testRelationWithData.groupBy(Literal(1))(Literal(100).as("a"))
     )
   }
-
-  test("SPARK-47895: group by alias repeated analysis") {
-    val plan = testRelation.groupBy($"b")(Literal(100).as("b")).analyze
-    comparePlans(
-      plan,
-      testRelation.groupBy(Literal(1))(Literal(100).as("b"))
-    )
-
-    val testRelationWithData = testRelation.copy(data = Seq(new GenericInternalRow(Array(1: Any))))
-    // Copy the plan to reset its `analyzed` flag, so that analyzer rules will re-apply.
-    val copiedPlan = plan.transform {
-      case _: LocalRelation => testRelationWithData
-    }
-    comparePlans(
-      copiedPlan.analyze, // repeated analysis
-      testRelationWithData.groupBy(Literal(1))(Literal(100).as("b"))
-    )
-  }
 }
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out
@@ -331,6 +331,13 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 }
 
 
+-- !query
+SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc
+-- !query analysis
+Aggregate [(col1#x % 3)], [max(col1#x) AS max(col1)#x, 3 AS abc#x]
++- LocalRelation [col1#x]
+
+
 -- !query
 set spark.sql.groupByAliases=false
 -- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql
@@ -43,6 +43,9 @@ SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k;
 -- Aggregate functions cannot be used in GROUP BY
 SELECT COUNT(b) AS k FROM testData GROUP BY k;
 
+-- Ordinal is replaced correctly when grouping by alias of a literal
+SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc;
+
 -- turn off group by aliases
 set spark.sql.groupByAliases=false;
 
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out
@@ -277,6 +277,16 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 }
 
 
+-- !query
+SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc
+-- !query schema
+struct<max(col1):int,abc:int>
+-- !query output
+2	3
+3	3
+4	3
+
+
 -- !query
 set spark.sql.groupByAliases=false
 -- !query schema