
Commit 60371b5

flaming-archer authored and pan3793 committed
[KYUUBI #7122] Support ORC hive table pushdown filter
### Why are the changes needed?

Previously, the `HiveScan` class was used to read data. When a table is determined to be ORC, the `OrcScan` from Spark DataSource V2 can be used instead. `OrcScan` supports filter pushdown, while `HiveScan` does not yet. In our testing, this yields roughly a 2x performance improvement.

The conversion can be controlled by setting `spark.sql.kyuubi.hive.connector.read.convertMetastoreOrc`. When enabled, the data source ORC reader is used to process ORC tables created with the HiveQL syntax, instead of Hive SerDe.

Close #7122

### How was this patch tested?

Added unit test.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #7123 from flaming-archer/master_scanbuilder_new.

Closes #7122

c3f412f [tian bao] add case _
2be4890 [tian bao] Merge branch 'master_scanbuilder_new' of github.com:flaming-archer/kyuubi into master_scanbuilder_new
c825d0f [tian bao] review change
8a26d6a [tian bao] Update extensions/spark/kyuubi-spark-connector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/KyuubiHiveConnectorConf.scala
68d4196 [tian bao] review change
bed007f [tian bao] review change
b89e6e6 [tian bao] Optimize UT
5a8941b [tian bao] fix failed ut
dc1ba47 [tian bao] orc pushdown version 0

Authored-by: tian bao <2011xuesong@gmail.com>
Signed-off-by: Cheng Pan <chengpan@apache.org>
1 parent 8492818 commit 60371b5
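As a usage sketch (not part of this patch), assuming a SparkSession with the Kyuubi Spark Hive connector registered under the catalog name `hive`, the conversion can be toggled per session and the pushdown verified via EXPLAIN; the table and column names below are illustrative:

```scala
// Sketch only: assumes `spark` is a SparkSession with the Kyuubi Spark Hive
// connector registered as catalog "hive"; table/column names are illustrative.
spark.sql("SET spark.sql.kyuubi.hive.connector.read.convertMetastoreOrc=true")

spark.sql("CREATE TABLE hive.default.orc_demo (id INT, value INT) STORED AS ORC")
spark.sql("INSERT INTO hive.default.orc_demo VALUES (1, 100), (2, 200)")

// With conversion enabled the ORC data source scan is used, so the plan should
// report pushed filters, e.g. "PushedFilters: [IsNotNull(value), GreaterThan(value,100)]".
spark.sql("EXPLAIN SELECT * FROM hive.default.orc_demo WHERE value > 100").show(false)

// Setting it to false falls back to HiveScan, which does not push filters down.
spark.sql("SET spark.sql.kyuubi.hive.connector.read.convertMetastoreOrc=false")
```

Because the flag is defined with `buildConf` (not `buildStaticConf`), it can also be changed at runtime via `spark.conf.set`.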


7 files changed: +170 −17 lines


extensions/spark/kyuubi-spark-connector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/HiveTable.scala

Lines changed: 22 additions & 1 deletion
@@ -18,6 +18,7 @@
 package org.apache.kyuubi.spark.connector.hive
 
 import java.util
+import java.util.Locale
 
 import scala.collection.JavaConverters._
 import scala.collection.mutable
@@ -31,10 +32,12 @@ import org.apache.spark.sql.connector.catalog.TableCapability.{BATCH_READ, BATCH
 import org.apache.spark.sql.connector.expressions.Transform
 import org.apache.spark.sql.connector.read.ScanBuilder
 import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
+import org.apache.spark.sql.execution.datasources.v2.orc.OrcScanBuilder
 import org.apache.spark.sql.hive.kyuubi.connector.HiveBridgeHelper.{BucketSpecHelper, LogicalExpressions}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
 
+import org.apache.kyuubi.spark.connector.hive.KyuubiHiveConnectorConf.READ_CONVERT_METASTORE_ORC
 import org.apache.kyuubi.spark.connector.hive.read.{HiveCatalogFileIndex, HiveScanBuilder}
 import org.apache.kyuubi.spark.connector.hive.write.HiveWriteBuilder
 
@@ -59,6 +62,20 @@ case class HiveTable(
       catalogTable.stats.map(_.sizeInBytes.toLong).getOrElse(defaultTableSize))
   }
 
+  lazy val convertedProvider: Option[String] = {
+    val serde = catalogTable.storage.serde.getOrElse("").toUpperCase(Locale.ROOT)
+    val parquet = serde.contains("PARQUET")
+    val orc = serde.contains("ORC")
+    val provider = catalogTable.provider.map(_.toUpperCase(Locale.ROOT))
+    if (orc || provider.contains("ORC")) {
+      Some("ORC")
+    } else if (parquet || provider.contains("PARQUET")) {
+      Some("PARQUET")
+    } else {
+      None
+    }
+  }
+
   override def name(): String = catalogTable.identifier.unquotedString
 
   override def schema(): StructType = catalogTable.schema
@@ -77,7 +94,11 @@
   }
 
   override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
-    HiveScanBuilder(sparkSession, fileIndex, dataSchema, catalogTable)
+    convertedProvider match {
+      case Some("ORC") if sparkSession.sessionState.conf.getConf(READ_CONVERT_METASTORE_ORC) =>
+        OrcScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
+      case _ => HiveScanBuilder(sparkSession, fileIndex, dataSchema, catalogTable)
+    }
   }
 
   override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = {
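For context, a minimal standalone sketch of the provider detection above, assuming the usual Hive SerDe class names (the real code reads them from the table's `CatalogTable` metadata):

```scala
import java.util.Locale

// Mirrors the convertedProvider logic: match on the SerDe class name or the
// declared provider, case-insensitively, preferring ORC over Parquet.
def detectProvider(serde: Option[String], provider: Option[String]): Option[String] = {
  val s = serde.getOrElse("").toUpperCase(Locale.ROOT)
  val p = provider.map(_.toUpperCase(Locale.ROOT))
  if (s.contains("ORC") || p.contains("ORC")) Some("ORC")
  else if (s.contains("PARQUET") || p.contains("PARQUET")) Some("PARQUET")
  else None
}

// A table created with `STORED AS ORC` typically carries the SerDe
// "org.apache.hadoop.hive.ql.io.orc.OrcSerde", which contains "ORC".
assert(detectProvider(Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde"), None) == Some("ORC"))
// A table created with `USING orc` carries provider = Some("orc").
assert(detectProvider(None, Some("orc")) == Some("ORC"))
```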

extensions/spark/kyuubi-spark-connector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/KyuubiHiveConnectorConf.scala

Lines changed: 10 additions & 0 deletions
@@ -19,6 +19,8 @@ package org.apache.kyuubi.spark.connector.hive
 
 import java.util.Locale
 
+import org.apache.spark.sql.internal.SQLConf.buildConf
+
 object KyuubiHiveConnectorConf {
 
   import org.apache.spark.sql.internal.SQLConf.buildStaticConf
@@ -39,4 +41,12 @@ object KyuubiHiveConnectorConf {
         "Invalid value for 'spark.sql.kyuubi.hive.connector.externalCatalog.share.policy'." +
           "Valid values are 'ONE_FOR_ONE', 'ONE_FOR_ALL'.")
       .createWithDefault(OneForAllPolicy.name)
+
+  val READ_CONVERT_METASTORE_ORC =
+    buildConf("spark.sql.kyuubi.hive.connector.read.convertMetastoreOrc")
+      .doc("When enabled, the data source ORC reader is used to process " +
+        "ORC tables created by using the HiveQL syntax, instead of Hive SerDe.")
+      .version("1.11.0")
+      .booleanConf
+      .createWithDefault(true)
 }

extensions/spark/kyuubi-spark-connector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/read/HiveFileIndex.scala

Lines changed: 7 additions & 0 deletions
@@ -54,6 +54,13 @@ class HiveCatalogFileIndex(
 
   override def partitionSchema: StructType = table.partitionSchema
 
+  override def listFiles(
+      partitionFilters: Seq[Expression],
+      dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
+    val fileIndex = filterPartitions(partitionFilters)
+    fileIndex.listFiles(partitionFilters, dataFilters)
+  }
+
   private[hive] def listHiveFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression])
       : (Seq[PartitionDirectory], Map[PartitionDirectory, CatalogTablePartition]) = {
     val fileIndex = filterPartitions(partitionFilters)

extensions/spark/kyuubi-spark-connector-hive/src/test/scala/org/apache/kyuubi/spark/connector/hive/HiveCatalogSuite.scala

Lines changed: 26 additions & 3 deletions
@@ -31,11 +31,13 @@ import org.apache.spark.sql.catalyst.analysis.{NoSuchNamespaceException, NoSuchT
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog}
 import org.apache.spark.sql.connector.expressions.Transform
+import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan
 import org.apache.spark.sql.hive.kyuubi.connector.HiveBridgeHelper._
 import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
 
 import org.apache.kyuubi.spark.connector.hive.HiveTableCatalog.IdentifierHelper
+import org.apache.kyuubi.spark.connector.hive.KyuubiHiveConnectorConf.READ_CONVERT_METASTORE_ORC
 import org.apache.kyuubi.spark.connector.hive.read.HiveScan
 
 class HiveCatalogSuite extends KyuubiHiveTest {
@@ -355,8 +357,29 @@ class HiveCatalogSuite extends KyuubiHiveTest {
     val orcProps: util.Map[String, String] = new util.HashMap[String, String]()
     orcProps.put(TableCatalog.PROP_PROVIDER, "orc")
     val ot = catalog.createTable(orc_table, schema, Array.empty[Transform], orcProps)
-    val orcScan = ot.asInstanceOf[HiveTable]
-      .newScanBuilder(CaseInsensitiveStringMap.empty()).build().asInstanceOf[HiveScan]
-    assert(orcScan.isSplitable(new Path("empty")))
+
+    Seq("true", "false").foreach { value =>
+      withSparkSession(Map(READ_CONVERT_METASTORE_ORC.key -> value)) { _ =>
+        val scan = ot.asInstanceOf[HiveTable]
+          .newScanBuilder(CaseInsensitiveStringMap.empty()).build()
+
+        val orcScan = value match {
+          case "true" =>
+            assert(
+              scan.isInstanceOf[OrcScan],
+              s"Expected OrcScan, got ${scan.getClass.getSimpleName}")
+            scan.asInstanceOf[OrcScan]
+          case "false" =>
+            assert(
+              scan.isInstanceOf[HiveScan],
+              s"Expected HiveScan, got ${scan.getClass.getSimpleName}")
+            scan.asInstanceOf[HiveScan]
+          case _ =>
+            throw new IllegalArgumentException(
+              s"Unexpected value: '$value'. Only 'true' or 'false' are allowed.")
+        }
+        assert(orcScan.isSplitable(new Path("empty")))
+      }
+    }
   }
 }

extensions/spark/kyuubi-spark-connector-hive/src/test/scala/org/apache/kyuubi/spark/connector/hive/HiveQuerySuite.scala

Lines changed: 93 additions & 0 deletions
@@ -260,6 +260,99 @@ class HiveQuerySuite extends KyuubiHiveTest {
     }
   }
 
+  test("ORC filter pushdown") {
+    val table = "hive.default.orc_filter_pushdown"
+    withTable(table) {
+      spark.sql(
+        s"""
+           | CREATE TABLE $table (
+           |   id INT,
+           |   data STRING,
+           |   value INT
+           | ) PARTITIONED BY (dt STRING, region STRING)
+           | STORED AS ORC
+           | """.stripMargin).collect()
+
+      // Insert test data with partitions
+      spark.sql(
+        s"""
+           | INSERT INTO $table PARTITION (dt='2024-01-01', region='east')
+           | VALUES (1, 'a', 100), (2, 'b', 200), (11, 'aa', 100), (22, 'b', 200)
+           |""".stripMargin)
+
+      spark.sql(
+        s"""
+           | INSERT INTO $table PARTITION (dt='2024-01-01', region='west')
+           | VALUES (3, 'c', 300), (4, 'd', 400), (33, 'cc', 300), (44, 'dd', 400)
+           |""".stripMargin)
+      spark.sql(
+        s"""
+           | INSERT INTO $table PARTITION (dt='2024-01-02', region='east')
+           | VALUES (5, 'e', 500), (6, 'f', 600), (55, 'ee', 500), (66, 'ff', 600)
+           | """.stripMargin)
+
+      // Test multiple partition filters
+      val df1 = spark.sql(
+        s"""
+           | SELECT * FROM $table
+           | WHERE dt = '2024-01-01' AND region = 'east' AND value > 1500
+           |""".stripMargin)
+      assert(df1.count() === 0)
+
+      // Test multiple partition filters
+      val df2 = spark.sql(
+        s"""
+           | SELECT * FROM $table
+           | WHERE dt = '2024-01-01' AND region = 'east' AND value > 150
+           |""".stripMargin)
+      assert(df2.count() === 2)
+      assert(df2.collect().map(_.getInt(0)).toSet === Set(2, 22))
+
+      // Test explain
+      val df3 = spark.sql(
+        s"""
+           | EXPLAIN SELECT count(*) as total_rows
+           | FROM $table
+           | WHERE dt = '2024-01-01' AND region = 'east' AND value > 1
+           |""".stripMargin)
+      assert(df3.count() === 1)
+      // contains like : PushedFilters: [IsNotNull(value), GreaterThan(value,1)]
+      assert(df3.collect().map(_.getString(0)).filter { s =>
+        s.contains("PushedFilters") && !s.contains("PushedFilters: []")
+      }.toSet.size == 1)
+
+      // Test aggregation pushdown partition filters
+      spark.conf.set("spark.sql.orc.aggregatePushdown", true)
+
+      // Test aggregation pushdown partition filters
+      val df4 = spark.sql(
+        s"""
+           | SELECT count(*) as total_rows
+           | FROM $table
+           | WHERE dt = '2024-01-01' AND region = 'east'
+           | group by dt, region
+           | """.stripMargin)
+      assert(df4.count() === 1)
+      assert(df4.collect().map(_.getLong(0)).toSet === Set(4L))
+
+      val df5 = spark.sql(
+        s"""
+           | EXPLAIN SELECT count(*) as total_rows
+           | FROM $table
+           | WHERE dt = '2024-01-01' AND region = 'east'
+           | group by dt, region
+           | """.stripMargin)
+      assert(df5.count() === 1)
+      // contains like : PushedAggregation: [COUNT(*)],
+      assert(df5.collect().map(_.getString(0)).filter { s =>
+        s.contains("PushedAggregation") && !s.contains("PushedAggregation: []")
+      }.toSet.size == 1)
+
+      spark.conf.set("spark.sql.orc.aggregatePushdown", false)
+
+    }
+  }
+
   private def readPartitionedTable(format: String, hiveTable: Boolean): Unit = {
     withSparkSession() { spark =>
       val table = "hive.default.employee"

extensions/spark/kyuubi-spark-connector-hive/src/test/scala/org/apache/kyuubi/spark/connector/hive/KyuubiHiveTest.scala

Lines changed: 12 additions & 0 deletions
@@ -21,6 +21,7 @@ import org.apache.spark.SparkConf
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{QueryTest, SparkSession}
 import org.apache.spark.sql.connector.catalog.{SupportsNamespaces, TableCatalog}
+import org.apache.spark.sql.hive.kyuubi.connector.HiveBridgeHelper.Utils
 
 import org.apache.kyuubi.spark.connector.common.LocalSparkSession
 
@@ -77,5 +78,16 @@ abstract class KyuubiHiveTest extends QueryTest with Logging {
     f(innerSpark)
   }
 
+  /**
+   * Drops table `tableName` after calling `f`.
+   */
+  protected def withTable(tableNames: String*)(f: => Unit): Unit = {
+    Utils.tryWithSafeFinally(f) {
+      tableNames.foreach { name =>
+        spark.sql(s"DROP TABLE IF EXISTS $name")
+      }
+    }
+  }
+
   override def spark: SparkSession = innerSpark
 }

extensions/spark/kyuubi-spark-connector-hive/src/test/scala/org/apache/kyuubi/spark/connector/hive/command/DDLCommandTestUtils.scala

Lines changed: 0 additions & 13 deletions
@@ -78,19 +78,6 @@ trait DDLCommandTestUtils extends KyuubiHiveTest {
     fs.makeQualified(hadoopPath).toUri
   }
 
-  /**
-   * Drops table `tableName` after calling `f`.
-   */
-  protected def withTable(tableNames: String*)(f: => Unit): Unit = {
-    try {
-      f
-    } finally {
-      tableNames.foreach { name =>
-        spark.sql(s"DROP TABLE IF EXISTS $name")
-      }
-    }
-  }
-
   protected def withNamespaceAndTable(
       ns: String,
       tableName: String,