Commit d2a864f

asl3 authored and gengliangwang committed
[SPARK-51747][SQL] Data source cached plan should respect options
### What changes were proposed in this pull request?

Make the data source cached plan respect options, such as the CSV delimiter. Previously, DataSourceStrategy cached the first resolved plan and reused it for all later reads, ignoring updated options. With this change, a **new plan** is returned when the options change.

### Why are the changes needed?

For example:

```
spark.sql("CREATE TABLE t(a string, b string) USING CSV".stripMargin)
spark.sql("INSERT INTO TABLE t VALUES ('a;b', 'c')")
spark.sql("SELECT * FROM t").show()
spark.sql("SELECT * FROM t WITH ('delimiter' = ';')")
```

Expected output:

```
+----+----+
|col1|col2|
+----+----+
| a;b|   c|
+----+----+

+----+----+
|col1|col2|
+----+----+
|   a| b,c|
+----+----+
```

Output before this PR:

```
+----+----+
|col1|col2|
+----+----+
| a;b|   c|
+----+----+

+----+----+
|col1|col2|
+----+----+
| a;b|   c|
+----+----+
```

The PR is needed to get the expected result.

### Does this PR introduce _any_ user-facing change?

Yes, it corrects the caching behavior of DataSourceStrategy.

### How was this patch tested?

Added a test in DDLSuite.scala.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #50538 from asl3/asl3/datasourcestrategycacheoptions.

Lead-authored-by: Amanda Liu <amanda.liu@databricks.com>
Co-authored-by: Gengliang Wang <gengliang@apache.org>
Signed-off-by: Gengliang Wang <gengliang@apache.org>
1 parent a9987a3 · commit d2a864f
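For context, here is a minimal, self-contained sketch of the cache-then-rebuild pattern the fix adopts. `Relation` and `PlanCache` are hypothetical stand-ins for illustration only, not Spark's `SessionCatalog` API; the real change is in the diff below.

```scala
import scala.collection.mutable

// Hypothetical stand-in for a resolved relation carrying its read options.
final case class Relation(options: Map[String, String])

object PlanCache {
  private val cache = mutable.Map.empty[String, Relation]

  // Before the fix: the first resolved plan was cached and always reused,
  // so later queries silently kept the original options.
  // After the fix: a cache hit whose options differ is rebuilt with the
  // new options instead of being returned as-is.
  def getOrRebuild(table: String, dsOptions: Map[String, String]): Relation =
    cache.get(table) match {
      case None =>
        val plan = Relation(dsOptions)
        cache.update(table, plan)
        plan
      case Some(r) if r.options != dsOptions =>
        // Return a fresh relation; as in the actual diff, the cached entry
        // keeps the original options, so queries without options still hit it.
        r.copy(options = dsOptions)
      case Some(r) => r
    }
}
```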

File tree

2 files changed: +59 −14 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala

Lines changed: 30 additions & 14 deletions
```diff
@@ -21,6 +21,7 @@ import java.util.Locale
 
 import scala.collection.immutable.ListMap
 import scala.collection.mutable
+import scala.jdk.CollectionConverters._
 
 import org.apache.hadoop.fs.Path
 
@@ -256,20 +257,35 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan]
       QualifiedTableName(table.identifier.catalog.get, table.database, table.identifier.table)
     val catalog = sparkSession.sessionState.catalog
     val dsOptions = DataSourceUtils.generateDatasourceOptions(extraOptions, table)
-    catalog.getCachedPlan(qualifiedTableName, () => {
-      val dataSource =
-        DataSource(
-          sparkSession,
-          // In older version(prior to 2.1) of Spark, the table schema can be empty and should be
-          // inferred at runtime. We should still support it.
-          userSpecifiedSchema = if (table.schema.isEmpty) None else Some(table.schema),
-          partitionColumns = table.partitionColumnNames,
-          bucketSpec = table.bucketSpec,
-          className = table.provider.get,
-          options = dsOptions,
-          catalogTable = Some(table))
-      LogicalRelation(dataSource.resolveRelation(checkFilesExist = false), table)
-    })
+    catalog.getCachedTable(qualifiedTableName) match {
+      case null =>
+        val dataSource =
+          DataSource(
+            sparkSession,
+            // In older version(prior to 2.1) of Spark, the table schema can be empty and should be
+            // inferred at runtime. We should still support it.
+            userSpecifiedSchema = if (table.schema.isEmpty) None else Some(table.schema),
+            partitionColumns = table.partitionColumnNames,
+            bucketSpec = table.bucketSpec,
+            className = table.provider.get,
+            options = dsOptions,
+            catalogTable = Some(table))
+        val plan = LogicalRelation(dataSource.resolveRelation(checkFilesExist = false), table)
+        catalog.cacheTable(qualifiedTableName, plan)
+        plan
+
+      // If the cached table relation's options differ from the new options:
+      // 1. Create a new HadoopFsRelation with updated options
+      // 2. Return a new LogicalRelation with the updated HadoopFsRelation
+      // This ensures the relation reflects any changes in data source options
+      case r @ LogicalRelation(fsRelation: HadoopFsRelation, _, _, _, _)
+          if new CaseInsensitiveStringMap(fsRelation.options.asJava) !=
+            new CaseInsensitiveStringMap(dsOptions.asJava) =>
+        val newFsRelation = fsRelation.copy(options = dsOptions)(sparkSession)
+        r.copy(relation = newFsRelation)
+
+      case other => other
+    }
   }
 
   private def getStreamingRelation(
```
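Note that the guard compares the two option maps only after wrapping them in `CaseInsensitiveStringMap`, so a change in key casing alone does not trigger a rebuild. A minimal sketch of that comparison, assuming `CaseInsensitiveStringMap` implements value equality over its lower-cased keys (which the guard above relies on):

```scala
import scala.jdk.CollectionConverters._
import org.apache.spark.sql.util.CaseInsensitiveStringMap

object OptionsComparison {
  def main(args: Array[String]): Unit = {
    val cached   = Map("Delimiter" -> ";")   // options on the cached relation
    val incoming = Map("delimiter" -> ";")   // options from the new query

    // Keys differ only in case, values match: the wrapped maps compare
    // equal, so the cached relation would be reused rather than rebuilt.
    println(new CaseInsensitiveStringMap(cached.asJava) ==
      new CaseInsensitiveStringMap(incoming.asJava)) // true

    // A genuinely different value would take the rebuild branch instead.
    val changed = Map("delimiter" -> ",")
    println(new CaseInsensitiveStringMap(cached.asJava) ==
      new CaseInsensitiveStringMap(changed.asJava)) // false
  }
}
```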

sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala

Lines changed: 29 additions & 0 deletions
```diff
@@ -1376,6 +1376,35 @@ abstract class DDLSuite extends QueryTest with DDLSuiteBase {
     }
   }
 
+  test("SPARK-51747: Data source cached plan should respect options") {
+    withTable("t") {
+      spark.sql("CREATE TABLE t(a string, b string) USING CSV".stripMargin)
+      spark.sql("INSERT INTO TABLE t VALUES ('a;b', 'c')")
+      spark.sql("INSERT INTO TABLE t VALUES ('hello; world', 'test')")
+
+      // check initial contents of table
+      checkAnswer(spark.table("t"), Row("a;b", "c") :: Row("hello; world", "test") :: Nil)
+
+      // no option
+      checkAnswer(
+        spark.sql("SELECT * FROM t"),
+        Row("a;b", "c") :: Row("hello; world", "test") :: Nil
+      )
+
+      // respect delimiter option
+      checkAnswer(
+        spark.sql("SELECT * FROM t WITH ('delimiter' = ';')"),
+        Row("a", "b,c") :: Row("hello", " world,test") :: Nil
+      )
+
+      // respect lineSep option
+      checkAnswer(
+        spark.sql("SELECT * FROM t WITH ('lineSep' = ';')"),
+        Row("a", null) :: Row("b", "c\n") :: Row("hello", null) :: Row(" world", "test\n") :: Nil
+      )
+    }
+  }
+
   test("SPARK-18009 calling toLocalIterator on commands") {
     import scala.jdk.CollectionConverters._
     val df = sql("show databases")
```
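The `lineSep` expectations follow from how the stored bytes are re-split: with `;` as the record separator, each stored CSV line yields two records, and each record is then parsed with the default `,` delimiter. A hedged, self-contained illustration using plain string operations (not Spark's CSV parser), assuming the file contents are exactly what the INSERTs wrote:

```scala
// First stored line, written with the default ',' delimiter and '\n' line end.
val stored = "a;b,c\n"
// lineSep = ";" splits records on ';' instead of '\n'.
val records = stored.split(";")           // Array("a", "b,c\n")
// Each record is parsed with the default ',' delimiter; limit -1 keeps
// trailing empty fields, as a CSV parser would.
val rows = records.map(_.split(",", -1))  // Array(Array("a"), Array("b", "c\n"))
// -> Row("a", null) and Row("b", "c\n"), matching the test above; the '\n'
// is now ordinary column data. "hello; world,test\n" splits the same way
// into Row("hello", null) and Row(" world", "test\n").
```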
