Adding _source and schema merging to index_mappings #1101

Merged: 28 commits, May 13, 2025

Commits (the diff below shows changes from 19 of the 28 commits)
518e1c4  Fix antlr4 parser issues (#1094)  (LantaoJin, Apr 2, 2025)
cab2bc6  adding _source to index_mappings  (ahkcs, Apr 9, 2025)
985aa67  syntax fix  (ahkcs, Apr 11, 2025)
729f560  Apply scalafmt  (ahkcs, Apr 14, 2025)
62acaa4  Added index_mapping as an option in index.md, applied scalafmtAll  (ahkcs, Apr 14, 2025)
388efda  improve readability  (ahkcs, Apr 14, 2025)
453f389  Removed index_mappings from FlintMetaData.scala, Modified index.md  (ahkcs, Apr 15, 2025)
2d6b207  removed indexMappingsSourceEnabled from FlintMetadata.scala  (ahkcs, Apr 15, 2025)
7018252  removed indexMappingsSourceEnabled from FlintMetadata.scala  (ahkcs, Apr 15, 2025)
9574fe4  removed indexMappingsSourceEnabled from FlintMetadata.scala  (ahkcs, Apr 15, 2025)
2e2f75e  Removed indexMappingsSourceEnabled from FlintMetadata.scala and remov…  (ahkcs, Apr 15, 2025)
4c60e6d  Added some test cases to test serialzie() and fixed some formatting i…  (ahkcs, Apr 16, 2025)
50e822d  Added some test cases for FlintOpenSearchIndexMetadataServiceSuite.scala  (ahkcs, Apr 16, 2025)
567883b  Added schema merging to index_mappings, added some test cases  (ahkcs, Apr 16, 2025)
3d77b65  updated test cases  (ahkcs, Apr 18, 2025)
cefb025  Minor format fix  (ahkcs, Apr 18, 2025)
fb33ac0  minor fixes  (ahkcs, Apr 21, 2025)
e24846f  added nested schema merging logic, moved mergeSchema to serialize, up…  (ahkcs, Apr 24, 2025)
cc42e9d  updated some comments  (ahkcs, Apr 24, 2025)
9609c5f  fixed some formatting issues based on the comments  (ahkcs, May 5, 2025)
dac419c  fixed syntax issue  (ahkcs, May 5, 2025)
c4f1f3a  syntax issue  (ahkcs, May 6, 2025)
b8618f2  syntax issue  (ahkcs, May 6, 2025)
2e91150  fixed the FlintSparkSkippingIndexITSuite  (ahkcs, May 6, 2025)
3daf1a5  fixing schema merging limitation  (ahkcs, May 8, 2025)
9f2532d  less scala/java conversion  (ahkcs, May 12, 2025)
200e7b5  style fix  (ahkcs, May 12, 2025)
db83f0f  fix unnecessary casting  (ahkcs, May 12, 2025)
docs/index.md (2 additions, 0 deletions)
@@ -394,6 +394,7 @@ User can provide the following options in `WITH` clause of create statement:
+ `watermark_delay`: a string as a time expression for how late data can arrive and still be processed, e.g. 1 minute, 10 seconds. This is required by auto and incremental refresh on a materialized view if its query contains aggregation.
+ `output_mode`: a mode string that describes how data will be written to the streaming sink. If unspecified, the default append mode will be applied.
+ `index_settings`: a JSON string as index settings for the OpenSearch index that will be created. Please follow the format in the OpenSearch documentation. If unspecified, default OpenSearch index settings will be applied.
+ `index_mappings`: a JSON string specifying additional OpenSearch index mappings, such as metadata fields (e.g., `_source`) or mapping parameters (e.g., `enabled`, `index`). This allows customizing certain parts of the index mappings. The base mappings are automatically generated; if unspecified, only the defaults will be applied. Refer to [OpenSearch metadata fields](https://docs.opensearch.org/docs/latest/field-types/metadata-fields/source/) and [mapping parameters](https://docs.opensearch.org/docs/latest/field-types/mapping-parameters/index/) for supported options.
+ `id_expression`: an expression string that generates an ID column to guarantee idempotency when the index refresh job restarts or retries. If an empty string is provided, no ID column will be generated.
+ `extra_options`: a JSON string as extra options passed directly to the Spark streaming source and sink APIs. Use the qualified source table name (because there could be multiple) and "sink", e.g. '{"sink": "{key: val}", "table1": {key: val}}'

@@ -407,6 +408,7 @@ WITH (
watermark_delay = '1 Second',
output_mode = 'complete',
index_settings = '{"number_of_shards": 2, "number_of_replicas": 3}',
index_mappings = '{ "_source": { "enabled": false } }',
id_expression = "sha1(concat_ws('\0',startTime,status))",
extra_options = '{"spark_catalog.default.alb_logs": {"maxFilesPerTrigger": "1"}}'
)
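
Beyond `_source`, the same option can carry per-field mapping parameters that are merged into the auto-generated schema (see `mergeSchema` below). A hedged sketch, with a hypothetical field name `clientIp`:

index_mappings = '{ "properties": { "clientIp": { "index": false } } }'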

FlintOpenSearchIndexMetadataService.scala

@@ -7,16 +7,18 @@ package org.opensearch.flint.core.storage

import java.util

import scala.collection.JavaConverters._
import scala.collection.convert.ImplicitConversions.`map AsScala`

import org.opensearch.client.RequestOptions
import org.opensearch.client.indices.{GetIndexRequest, GetIndexResponse, PutMappingRequest}
import org.opensearch.common.xcontent.{XContentParser, XContentType}
import org.opensearch.flint.common.FlintVersion
import org.opensearch.flint.common.metadata.{FlintIndexMetadataService, FlintMetadata}
import org.opensearch.flint.core.FlintOptions
import org.opensearch.flint.core.IRestHighLevelClient
import org.opensearch.flint.core.metadata.FlintJsonHelper._
import org.slf4j.LoggerFactory

import org.apache.spark.internal.Logging

@@ -99,7 +101,7 @@ class FlintOpenSearchIndexMetadataService(options: FlintOptions)
}

object FlintOpenSearchIndexMetadataService {

private val logger = LoggerFactory.getLogger(this.getClass)
def serialize(metadata: FlintMetadata): String = {
serialize(metadata, true)
}
Expand Down Expand Up @@ -134,9 +136,21 @@ object FlintOpenSearchIndexMetadataService {
optionalObjectField(builder, "properties", metadata.properties)
}
}
// Add _source field (emitted only when explicitly disabled via index_mappings)
val indexMappingsOpt =
Option(metadata.options.get("index_mappings")).map(_.asInstanceOf[String])
val sourceEnabled = extractSourceEnabled(indexMappingsOpt)
if (!sourceEnabled) {
objectField(builder, "_source") {
builder.field("enabled", sourceEnabled)
}
}
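// Illustration (assumed option value): with index_mappings = '{"_source": {"enabled": false}}'
// the serialized mappings gain "_source": {"enabled": false}; if the option is absent or
// enabled is true, no _source entry is written and OpenSearch's default applies.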

// Add properties (schema) field, merging mapping parameters from index_mappings
val tempSchema = metadata.schema.asScala.toMap
val tempOptions = metadata.options.asScala.toMap
val schema = mergeSchema(tempSchema, tempOptions).asJava
builder.field("properties", schema)
})
} catch {
case e: Exception =>
Expand Down Expand Up @@ -191,6 +205,7 @@ object FlintOpenSearchIndexMetadataService {
}
}
}
case "_source" => parser.skipChildren()
case "properties" =>
builder.schema(parser.map())
case _ => // Ignore other fields, for instance, dynamic.
@@ -203,4 +218,169 @@ object FlintOpenSearchIndexMetadataService {
throw new IllegalStateException("Failed to parse metadata JSON", e)
}
}

def extractSourceEnabled(indexMappingsJsonOpt: Option[String]): Boolean = {
var sourceEnabled: Boolean = true

indexMappingsJsonOpt.foreach { jsonStr =>
try {
parseJson(jsonStr) { (parser, fieldName) =>
fieldName match {
case "_source" =>
parseObjectField(parser) { (parser, innerFieldName) =>
innerFieldName match {
case "enabled" =>
sourceEnabled = parser.booleanValue()
return sourceEnabled
case _ => // Ignore
}
}
case _ => // Ignore
}
}
} catch {
case _: Exception => // Ignore malformed index_mappings JSON and keep the default
}
}

sourceEnabled
}
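
// Usage sketch (hypothetical inputs); the default is true, matching OpenSearch:
// extractSourceEnabled(Some("""{"_source": {"enabled": false}}""")) // false
// extractSourceEnabled(None) // true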

/**
* Merges mapping parameters from the `index_mappings` index option into the existing schema.
* If the option contains mapping parameters for fields that exist in allFieldTypes, those
* configurations are merged into the corresponding field entries.
*
* @param allFieldTypes
* Map of field names to their type/configuration details
* @param options
* Map of Flint index options, which may contain an `index_mappings` JSON string
* @return
* Merged map with combined mapping parameters
*/
def mergeSchema(
allFieldTypes: Map[String, AnyRef],
options: Map[String, AnyRef]): Map[String, AnyRef] = {
val indexMappingsOpt = options.get("index_mappings").flatMap {
case s: String => Some(s)
case _ => None
}

var result = allFieldTypes

// Track mappings from leaf name to configuration properties
var fieldConfigs = Map.empty[String, Map[String, AnyRef]]

indexMappingsOpt.foreach { jsonStr =>
try {
// Extract nested field configurations - key is the leaf name
parseJson(jsonStr) { (parser, fieldName) =>
fieldName match {
case "_source" =>
parser.skipChildren() // Skip _source section

case "properties" =>
// Process properties recursively to extract field configs
fieldConfigs = extractNestedProperties(parser)

case _ =>
parser.skipChildren() // Skip other fields
}
}

// Apply extracted configurations to schema while preserving structure
result = result.map { case (fullFieldName, fieldType) =>
val leafFieldName = extractLeafFieldName(fullFieldName)

if (fieldConfigs.contains(leafFieldName)) {
// We have config for this leaf field name
fieldType match {
case existingConfig: java.util.Map[_, _] =>
val mergedConfig = new java.util.HashMap[String, AnyRef](
existingConfig.asInstanceOf[java.util.Map[String, AnyRef]])

// Add/overwrite with new config values
fieldConfigs(leafFieldName).foreach { case (k, v) =>
mergedConfig.put(k, v)
}

// Return the updated field with its original key
(fullFieldName, mergedConfig)

case _ =>
// If field type isn't a map, keep it unchanged
(fullFieldName, fieldType)
}
} else {
// No config for this field, keep it unchanged
(fullFieldName, fieldType)
}
}
} catch {
case ex: Exception =>
logger.error(s"Error merging schema: ${ex.getMessage}")
}
}

result
}
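
// Worked example (hypothetical field names): mapping parameters are matched by leaf
// field name, so a nested index_mappings entry updates the flattened schema key:
//   allFieldTypes: "aws.vpc.count" -> {"type": "long"}
//   options: "index_mappings" ->
//     '{"properties": {"aws": {"properties": {"vpc": {"properties": {"count": {"index": false}}}}}}}'
//   result: "aws.vpc.count" -> {"type": "long", "index": false}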

/**
* Recursively extracts mapping parameters from a nested `properties` structure. Returns a
* map keyed by leaf field name to that field's configuration.
*/
private def extractNestedProperties(
parser: XContentParser): Map[String, Map[String, AnyRef]] = {
var fieldConfigs = Map.empty[String, Map[String, AnyRef]]

parseObjectField(parser) { (parser, fieldName) =>
var fieldConfig = Map.empty[String, AnyRef]
var hasNestedProperties = false

parseObjectField(parser) { (parser, propName) =>
propName match {
case "properties" =>
// This field has nested properties - recurse
hasNestedProperties = true
val nestedConfigs = extractNestedProperties(parser)
fieldConfigs ++= nestedConfigs

case "type" =>
fieldConfig += ("type" -> parser.text().asInstanceOf[AnyRef])

case "format" =>
fieldConfig += ("format" -> parser.text().asInstanceOf[AnyRef])

case "index" =>
fieldConfig += ("index" -> java.lang.Boolean
.valueOf(parser.booleanValue())
.asInstanceOf[AnyRef])

case _ =>
// Skip any unrecognized properties
parser.skipChildren()
}
}

// If this is a leaf field (no nested properties), add its config
if (!hasNestedProperties && fieldConfig.nonEmpty) {
fieldConfigs += (fieldName -> fieldConfig)
}
}

fieldConfigs
}
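
// Sketch of the flattening (hypothetical input), parsing the object under "properties":
//   {"aws": {"properties": {"vpc": {"properties": {"count": {"index": false}}}}}}
//     => Map("count" -> Map("index" -> false))
// Note that configs are keyed by leaf name only, so two leaves sharing a name share one config.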

/**
* Extracts the leaf field name from a potentially nested field path. For example:
* "aws.vpc.count" -> "count"
*/
private def extractLeafFieldName(fullFieldPath: String): String = {
val lastDotIndex = fullFieldPath.lastIndexOf('.')
if (lastDotIndex >= 0) {
fullFieldPath.substring(lastDotIndex + 1)
} else {
fullFieldPath
}
}
}