Skip to content

Commit 8718eba

Browse files
mihailotim-dbcloud-fan
authored andcommitted
Revert "[SPARK-47895][SQL] group by alias should be idempotent"
### What changes were proposed in this pull request? This PR reverts #50461 because it introduces a correctness issue by replacing an `Alias` with incorrect literal. A followup will be made with a different way to fix this issue. ### Why are the changes needed? In the below example, alias `abc` is replaced with `2` resulting in 2 rows instead of correct 3. Before erronous PR: ![image](https://github.com/user-attachments/assets/dcc98323-369d-4f5e-b0ad-de5b76ffc5c3) After erronous PR: ![image](https://github.com/user-attachments/assets/31c24125-6654-48f8-9b55-40a6a667ed23) ### Does this PR introduce _any_ user-facing change? User now sees an error message instead of an incorrect result. ### How was this patch tested? Added a test case to check for this behavior in the future ### Was this patch authored or co-authored using generative AI tooling? No Closes #50567 from mihailotim-db/mihailotim-db/revert_group_by. Authored-by: Mihailo Timotic <mihailo.timotic@databricks.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent c3af3dc commit 8718eba

File tree

5 files changed

+21
-31
lines changed

5 files changed

+21
-31
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -116,19 +116,7 @@ class ResolveReferencesInAggregate(val catalogManager: CatalogManager) extends S
116116
groupExprs.map { g =>
117117
g.transformWithPruning(_.containsPattern(UNRESOLVED_ATTRIBUTE)) {
118118
case u: UnresolvedAttribute =>
119-
val (result, index) =
120-
selectList.zipWithIndex.find(ne => conf.resolver(ne._1.name, u.name))
121-
.getOrElse((u, -1))
122-
123-
trimAliases(result) match {
124-
// HACK ALERT: If the expanded grouping expression is an integer literal, don't use it
125-
// but use an integer literal of the index. The reason is we may
126-
// repeatedly analyze the plan, and the original integer literal may cause
127-
// failures with a later GROUP BY ordinal resolution. GROUP BY constant is
128-
// meaningless so whatever value does not matter here.
129-
case IntegerLiteral(_) => Literal(index + 1)
130-
case _ => result
131-
}
119+
selectList.find(ne => conf.resolver(ne.name, u.name)).getOrElse(u)
132120
}
133121
}
134122
} else {

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -104,22 +104,4 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest {
104104
testRelationWithData.groupBy(Literal(1))(Literal(100).as("a"))
105105
)
106106
}
107-
108-
test("SPARK-47895: group by alias repeated analysis") {
109-
val plan = testRelation.groupBy($"b")(Literal(100).as("b")).analyze
110-
comparePlans(
111-
plan,
112-
testRelation.groupBy(Literal(1))(Literal(100).as("b"))
113-
)
114-
115-
val testRelationWithData = testRelation.copy(data = Seq(new GenericInternalRow(Array(1: Any))))
116-
// Copy the plan to reset its `analyzed` flag, so that analyzer rules will re-apply.
117-
val copiedPlan = plan.transform {
118-
case _: LocalRelation => testRelationWithData
119-
}
120-
comparePlans(
121-
copiedPlan.analyze, // repeated analysis
122-
testRelationWithData.groupBy(Literal(1))(Literal(100).as("b"))
123-
)
124-
}
125107
}

sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,13 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
331331
}
332332

333333

334+
-- !query
335+
SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc
336+
-- !query analysis
337+
Aggregate [(col1#x % 3)], [max(col1#x) AS max(col1)#x, 3 AS abc#x]
338+
+- LocalRelation [col1#x]
339+
340+
334341
-- !query
335342
set spark.sql.groupByAliases=false
336343
-- !query analysis

sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k;
4343
-- Aggregate functions cannot be used in GROUP BY
4444
SELECT COUNT(b) AS k FROM testData GROUP BY k;
4545

46+
-- Ordinal is replaced correctly when grouping by alias of a literal
47+
SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc;
48+
4649
-- turn off group by aliases
4750
set spark.sql.groupByAliases=false;
4851

sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,16 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
277277
}
278278

279279

280+
-- !query
281+
SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc
282+
-- !query schema
283+
struct<max(col1):int,abc:int>
284+
-- !query output
285+
2 3
286+
3 3
287+
4 3
288+
289+
280290
-- !query
281291
set spark.sql.groupByAliases=false
282292
-- !query schema

0 commit comments

Comments
 (0)