diff --git a/docs/changelog/130337.yaml b/docs/changelog/130337.yaml new file mode 100644 index 0000000000000..2ea20ebd1944e --- /dev/null +++ b/docs/changelog/130337.yaml @@ -0,0 +1,6 @@ +pr: 130337 +summary: Add `XmlProcessor` initial implementation +area: Ingest Node +type: enhancement +issues: + - 97364 diff --git a/docs/reference/enrich-processor/index.md b/docs/reference/enrich-processor/index.md index e220e763024e3..fb2cac99ee355 100644 --- a/docs/reference/enrich-processor/index.md +++ b/docs/reference/enrich-processor/index.md @@ -159,6 +159,9 @@ Refer to [Enrich your data](docs-content://manage-data/ingest/transform-enrich/d [`split` processor](/reference/enrich-processor/split-processor.md) : Splits a field into an array of values. +[`xml` processor](/reference/enrich-processor/xml-processor.md) +: Parses XML documents and converts them to JSON objects. + [`trim` processor](/reference/enrich-processor/trim-processor.md) : Trims whitespace from field. diff --git a/docs/reference/enrich-processor/toc.yml b/docs/reference/enrich-processor/toc.yml index 7da271e6f0554..370c020f5f393 100644 --- a/docs/reference/enrich-processor/toc.yml +++ b/docs/reference/enrich-processor/toc.yml @@ -46,3 +46,4 @@ toc: - file: urldecode-processor.md - file: uri-parts-processor.md - file: user-agent-processor.md + - file: xml-processor.md diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md new file mode 100644 index 0000000000000..7452fea12ed65 --- /dev/null +++ b/docs/reference/enrich-processor/xml-processor.md @@ -0,0 +1,575 @@ +--- +navigation_title: "XML" +mapped_pages: + - https://www.elastic.co/guide/en/elasticsearch/reference/current/xml-processor.html +--- + +# XML processor [xml-processor] + +Parses XML documents and converts them to JSON objects using a DOM parser. This processor efficiently handles XML data with a single-parse architecture that supports both structured output and XPath extraction for optimal performance. + +$$$xml-options$$$ + +| Name | Required | Default | Description | +| --- | --- | --- | --- | +| `field` | yes | - | The field containing the XML string to be parsed. | +| `target_field` | no | `field` | The field that the converted structured object will be written into. Any existing content in this field will be overwritten. | +| `store_xml` | no | `true` | If `true`, stores the parsed XML structure in the target field. If `false`, only XPath extraction results are stored. | +| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document. | +| `ignore_failure` | no | `false` | Ignore failures for the processor. When `true` and XML parsing fails, adds `_xmlparsefailure` tag to the document. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `to_lower` | no | `false` | Convert XML element names and attribute names to lowercase. | +| `ignore_empty_value` | no | `false` | If `true`, the processor will filter out null and empty values from the parsed XML structure, including empty elements, elements with null values, and elements with whitespace-only content. | +| `remove_namespaces` | no | `false` | If `true`, removes namespace prefixes from element and attribute names. | +| `force_content` | no | `false` | If `true`, forces text content and attributes to always parse to a hash value with `#text` key for content. | +| `force_array` | no | `false` | If `true`, forces all parsed values to be arrays. Single elements are wrapped in arrays. | +| `parse_options` | no | - | Controls XML parsing behavior. Set to `"strict"` for strict XML validation that fails fast on invalid content. | +| `xpath` | no | - | Map of XPath expressions to target field names. Extracts values from the XML using XPath and stores them in the specified fields. | +| `namespaces` | no | - | Map of namespace prefixes to URIs for use with XPath expressions. Required when XPath expressions contain namespace prefixes. | +| `description` | no | - | Description of the processor. Useful for describing the purpose of the processor or its configuration. | +| `if` | no | - | Conditionally execute the processor. See [Conditionally run a processor](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#conditionally-run-processor). | +| `on_failure` | no | - | Handle failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `tag` | no | - | Identifier for the processor. Useful for debugging and metrics. | + +## Configuration + +```js +{ + "xml": { + "field": "xml_field", + "target_field": "parsed_xml", + "ignore_empty_value": true + } +} +``` + +## Examples + +### Basic XML parsing + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "William H. GaddisThe RecognitionsOne of the great seminal American novels." + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "William H. GaddisThe RecognitionsOne of the great seminal American novels.", + "catalog": { + "book": { + "author": "William H. Gaddis", + "title": "The Recognitions", + "review": "One of the great seminal American novels." + } + } + } + } + } + ] +} +``` + +### Filtering empty values + +When `ignore_empty_value` is set to `true`, the processor will remove empty elements from the parsed XML: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "target_field": "parsed_xml", + "ignore_empty_value": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "William H. GaddisOne of the great seminal American novels. Some content" + } + } + ] +} +``` + +Result with empty elements filtered out: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "William H. GaddisOne of the great seminal American novels. Some content", + "parsed_xml": { + "catalog": { + "book": { + "author": "William H. Gaddis", + "review": "One of the great seminal American novels.", + "nested": { + "valid_content": "Some content" + } + } + } + } + } + } + } + ] +} +``` + +### Converting element names to lowercase + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "to_lower": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "William H. GaddisThe Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "William H. GaddisThe Recognitions", + "catalog": { + "book": { + "author": "William H. Gaddis", + "title": "The Recognitions" + } + } + } + } + } + ] +} +``` + +### Handling XML attributes + +XML attributes are included as properties in the resulting JSON object alongside element content: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis", + "catalog": { + "version": "1.0", + "book": { + "id": "123", + "isbn": "978-0-684-80335-9", + "title": { + "lang": "en", + "#text": "The Recognitions" + }, + "author": { + "nationality": "American", + "#text": "William H. Gaddis" + } + } + } + } + } + } + ] +} +``` + +### XPath extraction + +The XML processor can extract specific values using XPath expressions and store them in designated fields: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "store_xml": false, + "xpath": { + "//book/title/text()": "book_title", + "//book/author/text()": "book_author", + "//book/@id": "book_id" + } + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis1984George Orwell" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis1984George Orwell", + "book_title": ["The Recognitions", "1984"], + "book_author": ["William H. Gaddis", "George Orwell"], + "book_id": ["123", "456"] + } + } + } + ] +} +``` + +### XPath with namespaces + +When working with XML that uses namespaces, you need to configure namespace mappings: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "namespaces": { + "book": "http://example.com/book", + "author": "http://example.com/author" + }, + "xpath": { + "//book:catalog/book:item/book:title/text()": "titles", + "//author:info/@name": "author_names" + } + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The Recognitions", + "titles": "The Recognitions", + "author_names": "William H. Gaddis", + "book:catalog": { + "book:item": { + "book:title": "The Recognitions", + "author:info": { + "name": "William H. Gaddis" + } + } + } + } + } + } + ] +} +``` + +### Force array behavior + +When `force_array` is true, all parsed values become arrays: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "force_array": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The Recognitions", + "catalog": [ + { + "book": [ + { + "title": ["The Recognitions"] + } + ] + } + ] + } + } + } + ] +} +``` + +### Strict parsing mode + +Use `parse_options: "strict"` for strict XML validation: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "parse_options": "strict", + "ignore_failure": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "Invalid XML with control character" + } + } + ] +} +``` + +Result (with parsing failure): + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "Invalid XML with control character", + "tags": ["_xmlparsefailure"] + } + } + } + ] +} +``` + +### Mixed content handling + +When XML contains mixed content (text interspersed with elements), text fragments are combined and stored under the special `#text` key: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "This text is bold and this is italic!" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "This text is bold and this is italic!", + "foo": { + "b": "bold", + "i": "italic", + "#text": "This text is and this is !" + } + } + } + } + ] +} +``` + +### Force content mode + +When `force_content` is `true`, all element text content is stored under the special `#text` key: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "force_content": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The Recognitions", + "book": { + "author": "William H. Gaddis", + "#text": "The Recognitions" + } + } + } + } + ] +} +``` + +## XML features + +The XML processor supports: + +- **Elements with text content**: Converted to key-value pairs where the element name is the key and text content is the value +- **Nested elements**: Converted to nested JSON objects +- **Empty elements**: Converted to `null` values (can be filtered with `ignore_empty_value`) +- **Repeated elements**: Converted to arrays when multiple elements with the same name exist at the same level +- **XML attributes**: Included as properties in the JSON object alongside element content. When an element has both attributes and text content, the text is stored under a special `#text` key +- **Mixed content**: Elements with both text and child elements include text under a special `#text` key while attributes and child elements become object properties +- **Namespaces**: Namespace prefixes are preserved by default and can be used in XPath expressions with the `namespaces` configuration. Use `remove_namespaces: true` to strip namespace prefixes from element names diff --git a/modules/ingest-common/src/main/java/module-info.java b/modules/ingest-common/src/main/java/module-info.java index c3b3ab90892d9..84e30519d2d1b 100644 --- a/modules/ingest-common/src/main/java/module-info.java +++ b/modules/ingest-common/src/main/java/module-info.java @@ -20,6 +20,8 @@ requires org.apache.lucene.analysis.common; requires org.jruby.joni; + requires java.xml; + exports org.elasticsearch.ingest.common; // for painless opens org.elasticsearch.ingest.common to org.elasticsearch.painless.spi; // whitelist resource access diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java index 6e517d644cadb..0dc06e74af3bc 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java @@ -74,7 +74,8 @@ public Map getProcessors(Processor.Parameters paramet entry(TrimProcessor.TYPE, new TrimProcessor.Factory()), entry(URLDecodeProcessor.TYPE, new URLDecodeProcessor.Factory()), entry(UppercaseProcessor.TYPE, new UppercaseProcessor.Factory()), - entry(UriPartsProcessor.TYPE, new UriPartsProcessor.Factory()) + entry(UriPartsProcessor.TYPE, new UriPartsProcessor.Factory()), + entry(XmlProcessor.TYPE, new XmlProcessor.Factory()) ); } diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java new file mode 100644 index 0000000000000..3c2fac023fa20 --- /dev/null +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -0,0 +1,967 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.cluster.metadata.ProjectId; +import org.elasticsearch.ingest.AbstractProcessor; +import org.elasticsearch.ingest.ConfigurationUtils; +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.ingest.Processor; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.regex.Pattern; + +import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; + +/** + * Processor that parses XML documents and converts them to JSON objects using a single-pass streaming approach. + * + * Features: + * - XML to JSON conversion with configurable structure options + * - XPath extraction with namespace support + * - Configurable options: force_array, force_content, remove_namespaces, to_lower + * - Strict parsing mode for XML validation + * - Empty value filtering with ignore_empty_value option + * - Logstash-compatible error handling and behavior + */ +public final class XmlProcessor extends AbstractProcessor { + + public static final String TYPE = "xml"; + + private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); + + // Pre-compiled pattern to detect namespace prefixes + private static final Pattern NAMESPACE_PATTERN = Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); + + // Pre-configured SAX parser factories for secure XML parsing + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = + createSecureSaxParserFactoryNamespaceAwareStrict(); + + // Pre-configured document builder factory for DOM creation + private static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); + + private final String field; + private final String targetField; + private final boolean ignoreMissing; + private final boolean ignoreFailure; + private final boolean toLower; + private final boolean ignoreEmptyValue; + private final boolean storeXml; + private final boolean removeNamespaces; + private final boolean forceContent; + private final boolean forceArray; + private final Map xpathExpressions; + private final Map namespaces; + private final Map compiledXPathExpressions; + private final String parseOptions; + + XmlProcessor( + String tag, + String description, + String field, + String targetField, + boolean ignoreMissing, + boolean ignoreFailure, + boolean toLower, + boolean ignoreEmptyValue, + boolean storeXml, + boolean removeNamespaces, + boolean forceContent, + boolean forceArray, + Map xpathExpressions, + Map namespaces, + String parseOptions + ) { + super(tag, description); + this.field = field; + this.targetField = targetField; + this.ignoreMissing = ignoreMissing; + this.ignoreFailure = ignoreFailure; + this.toLower = toLower; + this.ignoreEmptyValue = ignoreEmptyValue; + this.storeXml = storeXml; + this.removeNamespaces = removeNamespaces; + this.forceContent = forceContent; + this.forceArray = forceArray; + this.xpathExpressions = xpathExpressions != null ? Map.copyOf(xpathExpressions) : Map.of(); + this.namespaces = namespaces != null ? Map.copyOf(namespaces) : Map.of(); + this.compiledXPathExpressions = compileXPathExpressions(this.xpathExpressions, this.namespaces); + this.parseOptions = parseOptions != null ? parseOptions : ""; + } + + public String getField() { + return field; + } + + public String getTargetField() { + return targetField; + } + + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public boolean isIgnoreEmptyValue() { + return ignoreEmptyValue; + } + + public boolean isStoreXml() { + return storeXml; + } + + public boolean isRemoveNamespaces() { + return removeNamespaces; + } + + public boolean isForceContent() { + return forceContent; + } + + public boolean isStrict() { + return "strict".equals(parseOptions); + } + + public boolean isForceArray() { + return forceArray; + } + + public boolean hasNamespaces() { + return namespaces.isEmpty() == false; + } + + public Map getNamespaces() { + return namespaces; + } + + public String getParseOptions() { + return parseOptions; + } + + @Override + public IngestDocument execute(IngestDocument document) { + Object fieldValue = document.getFieldValue(field, Object.class, ignoreMissing); + + if (fieldValue == null) { + if (ignoreMissing || ignoreFailure) { + return document; + } + throw new IllegalArgumentException("field [" + field + "] is null, cannot parse XML"); + } + + if (fieldValue instanceof String == false) { + if (ignoreFailure) { + return document; + } + throw new IllegalArgumentException("field [" + field + "] is not a string, cannot parse XML"); + } + + String xmlString = (String) fieldValue; + try { + // Always use streaming parser for optimal performance and memory usage + if (storeXml || xpathExpressions.isEmpty() == false) { + parseXmlAndXPath(document, xmlString.trim()); + } + } catch (Exception e) { + if (ignoreFailure) { + // Add failure tag similar to Logstash behavior + document.appendFieldValue("tags", "_xmlparsefailure"); + return document; + } + throw new IllegalArgumentException("field [" + field + "] contains invalid XML: " + e.getMessage(), e); + } + + return document; + } + + @Override + public String getType() { + return TYPE; + } + + /** + * Determines if a value should be considered empty for filtering purposes. + * Used by the ignore_empty_value feature to filter out empty content. + * + * Considers empty: + * - null values + * - empty or whitespace-only strings + * - empty Maps + * - empty Lists + * + * @param value the value to check + * @return true if the value should be considered empty + */ + private boolean isEmptyValue(Object value) { + if (value == null) { + return true; + } + if (value instanceof String) { + return ((String) value).trim().isEmpty(); + } + if (value instanceof Map) { + return ((Map) value).isEmpty(); + } + if (value instanceof List) { + return ((List) value).isEmpty(); + } + return false; + } + + /** + * Extract the text value from a DOM node for XPath result processing. + * Handles different node types appropriately: + * - TEXT_NODE and CDATA_SECTION_NODE: returns node value directly + * - ATTRIBUTE_NODE: returns attribute value + * - ELEMENT_NODE: returns text content (concatenated text of all descendants) + * - Other node types: returns text content as fallback + * + * @param node the DOM node to extract text from + * @return the text content of the node, or null if node is null + */ + private String getNodeValue(Node node) { + if (node == null) { + return null; + } + + switch (node.getNodeType()) { + case Node.ATTRIBUTE_NODE: + case Node.CDATA_SECTION_NODE: + case Node.TEXT_NODE: + return node.getNodeValue(); + case Node.ELEMENT_NODE: + default: + return node.getTextContent(); + } + } + + /** + * Applies force_array logic to ensure all fields are arrays when enabled. + * + * Behavior: + * - If force_array is false: returns content unchanged + * - If force_array is true and content is already a List: returns content unchanged + * - If force_array is true and content is not a List: wraps content in a new ArrayList + * - Handles null content appropriately (wraps null in array if force_array is true) + * + * @param elementName the name of the element (for context, not used in current implementation) + * @param content the content to potentially wrap in an array + * @return the content, optionally wrapped in an array based on force_array setting + */ + private Object applyForceArray(String elementName, Object content) { + if (forceArray && !(content instanceof List)) { + List arrayContent = new ArrayList<>(); + arrayContent.add(content); // Add content even if it's null (for empty elements) + return arrayContent; + } + return content; + } + + /** + * Evaluates precompiled XPath expressions against a DOM document and adds results to the ingest document. + * + * Features: + * - Uses precompiled XPath expressions for optimal performance + * - Extracts text values from matched nodes (elements, attributes, text nodes) + * - Single matches stored as strings, multiple matches as string arrays + * - Respects ignoreFailure setting for XPath evaluation errors + * + * @param document the ingest document to add XPath results to + * @param doc the DOM document to evaluate XPath expressions against + * @throws Exception if XPath processing fails and ignoreFailure is false + */ + private void processXPathExpressionsFromDom(IngestDocument document, Document doc) throws Exception { + // Use precompiled XPath expressions for optimal performance + for (Map.Entry entry : compiledXPathExpressions.entrySet()) { + String targetFieldName = entry.getKey(); + XPathExpression compiledExpression = entry.getValue(); + + try { + Object result = compiledExpression.evaluate(doc, XPathConstants.NODESET); + + if (result instanceof NodeList) { + NodeList nodeList = (NodeList) result; + List values = new ArrayList<>(); + + for (int i = 0; i < nodeList.getLength(); i++) { + Node node = nodeList.item(i); + String value = getNodeValue(node); + if (value != null && value.trim().isEmpty() == false) { + values.add(value); + } + } + + if (values.isEmpty() == false) { + if (values.size() == 1) { + document.setFieldValue(targetFieldName, values.get(0)); + } else { + document.setFieldValue(targetFieldName, values); + } + } + } + } catch (XPathExpressionException e) { + if (ignoreFailure == false) { + throw new IllegalArgumentException( + "XPath evaluation failed for target field [" + targetFieldName + "]: " + e.getMessage(), + e + ); + } + } + } + } + + /** + * Compiles XPath expressions at processor creation time for optimal runtime performance. + * This method pre-compiles all configured XPath expressions with appropriate namespace context, + * eliminating the compilation overhead during document processing. + * + * @param xpathExpressions map of XPath expressions to target field names + * @param namespaces map of namespace prefixes to URIs + * @return map of compiled XPath expressions keyed by target field name + * @throws IllegalArgumentException if XPath compilation fails or namespace validation fails + */ + private static Map compileXPathExpressions( + Map xpathExpressions, + Map namespaces + ) { + if (xpathExpressions.isEmpty()) { + return Map.of(); + } + + Map compiled = new HashMap<>(); + XPath xpath = XPATH_FACTORY.newXPath(); + + // Set namespace context if namespaces are defined + boolean hasNamespaces = namespaces.isEmpty() == false; + if (hasNamespaces) { + xpath.setNamespaceContext(new NamespaceContext() { + @Override + public String getNamespaceURI(String prefix) { + if (prefix == null) { + throw new IllegalArgumentException("Prefix cannot be null"); + } + return namespaces.getOrDefault(prefix, ""); + } + + @Override + public String getPrefix(String namespaceURI) { + for (Map.Entry entry : namespaces.entrySet()) { + if (entry.getValue().equals(namespaceURI)) { + return entry.getKey(); + } + } + return null; + } + + @Override + public Iterator getPrefixes(String namespaceURI) { + List prefixes = new ArrayList<>(); + for (Map.Entry entry : namespaces.entrySet()) { + if (entry.getValue().equals(namespaceURI)) { + prefixes.add(entry.getKey()); + } + } + return prefixes.iterator(); + } + }); + } + + // Use pre-compiled pattern to detect namespace prefixes + + for (Map.Entry entry : xpathExpressions.entrySet()) { + String xpathExpression = entry.getKey(); + String targetFieldName = entry.getValue(); + + // Validate namespace prefixes if no namespaces are configured + if (!hasNamespaces && NAMESPACE_PATTERN.matcher(xpathExpression).matches()) { + throw new IllegalArgumentException( + "Invalid XPath expression [" + + xpathExpression + + "]: contains namespace prefixes but no namespace configuration provided" + ); + } + + try { + XPathExpression compiledExpression = xpath.compile(xpathExpression); + compiled.put(targetFieldName, compiledExpression); + } catch (XPathExpressionException e) { + throw new IllegalArgumentException("Invalid XPath expression [" + xpathExpression + "]: " + e.getMessage(), e); + } + } + + return Map.copyOf(compiled); + } + + public static final class Factory implements Processor.Factory { + + @Override + public XmlProcessor create( + Map registry, + String processorTag, + String description, + Map config, + ProjectId projectId + ) throws Exception { + String field = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); + String targetField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "target_field", field); + boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); + boolean ignoreFailure = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_failure", false); + boolean toLower = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "to_lower", false); + boolean ignoreEmptyValue = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_empty_value", false); + boolean storeXml = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "store_xml", true); + boolean removeNamespaces = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "remove_namespaces", false); + boolean forceContent = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_content", false); + boolean forceArray = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_array", false); + + // Parse XPath expressions map + Map xpathExpressions = new HashMap<>(); + Object xpathConfig = config.get("xpath"); + if (xpathConfig != null) { + if (xpathConfig instanceof Map) { + @SuppressWarnings("unchecked") + Map xpathMap = (Map) xpathConfig; + for (Map.Entry entry : xpathMap.entrySet()) { + if (entry.getValue() instanceof String) { + xpathExpressions.put(entry.getKey(), (String) entry.getValue()); + } else { + throw new IllegalArgumentException( + "XPath target field [" + + entry.getKey() + + "] must be a string, got [" + + entry.getValue().getClass().getSimpleName() + + "]" + ); + } + } + } else { + throw new IllegalArgumentException("XPath configuration must be a map of expressions to target fields"); + } + } + + // Parse namespaces map + Map namespaces = new HashMap<>(); + Object namespaceConfig = config.get("namespaces"); + if (namespaceConfig != null) { + if (namespaceConfig instanceof Map) { + @SuppressWarnings("unchecked") + Map namespaceMap = (Map) namespaceConfig; + for (Map.Entry entry : namespaceMap.entrySet()) { + if (entry.getValue() instanceof String) { + namespaces.put(entry.getKey(), (String) entry.getValue()); + } else { + throw new IllegalArgumentException( + "Namespace prefix [" + + entry.getKey() + + "] must have a string URI, got [" + + entry.getValue().getClass().getSimpleName() + + "]" + ); + } + } + } else { + throw new IllegalArgumentException("Namespaces configuration must be a map of prefixes to URIs"); + } + } + + // Parse parse_options parameter + String parseOptions = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "parse_options", ""); + if (parseOptions != null && !parseOptions.isEmpty() && !"strict".equals(parseOptions)) { + throw new IllegalArgumentException("Invalid parse_options [" + parseOptions + "]. Only 'strict' is supported."); + } + + return new XmlProcessor( + processorTag, + description, + field, + targetField, + ignoreMissing, + ignoreFailure, + toLower, + ignoreEmptyValue, + storeXml, + removeNamespaces, + forceContent, + forceArray, + xpathExpressions, + namespaces, + parseOptions + ); + } + } + + /** + * Main XML parsing method that converts XML to JSON and optionally extracts XPath values. + * Uses streaming SAX parser with optional DOM building for XPath processing. + * + * @param document the ingest document to modify with parsed results + * @param xmlString the XML string to parse (should be trimmed) + * @throws Exception if XML parsing fails + */ + private void parseXmlAndXPath(IngestDocument document, String xmlString) throws Exception { + if (xmlString == null || xmlString.trim().isEmpty()) { + return; + } + + // Determine if we need DOM for XPath processing + boolean needsDom = xpathExpressions.isEmpty() == false; + + // Use the appropriate pre-configured SAX parser factory + javax.xml.parsers.SAXParserFactory factory = selectSaxParserFactory(); + + javax.xml.parsers.SAXParser parser = factory.newSAXParser(); + + // Configure error handler for strict mode + if (isStrict()) { + parser.getXMLReader().setErrorHandler(new org.xml.sax.ErrorHandler() { + @Override + public void warning(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { + throw exception; + } + + @Override + public void error(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { + throw exception; + } + + @Override + public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { + throw exception; + } + }); + } + + // Use enhanced handler that can build DOM during streaming when needed + XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); + + parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); + + // Store structured result if needed + if (storeXml) { + Object streamingResult = handler.getStructuredResult(); + if (streamingResult != null) { + document.setFieldValue(targetField, streamingResult); + } + } + + // Process XPath expressions if DOM was built during streaming + if (needsDom) { + Document domDocument = handler.getDomDocument(); + if (domDocument != null) { + processXPathExpressionsFromDom(document, domDocument); + } + } + } + + /** + * SAX ContentHandler that builds structured JSON output and optionally constructs a DOM tree during parsing. + * Handles XML-to-JSON conversion with support for all processor configuration options. + */ + private class XmlStreamingWithDomHandler extends org.xml.sax.helpers.DefaultHandler { + // Streaming parser state (for structured output) + private final java.util.Deque> elementStack = new java.util.ArrayDeque<>(); + private final java.util.Deque elementNameStack = new java.util.ArrayDeque<>(); + private final java.util.Deque textStack = new java.util.ArrayDeque<>(); + private final java.util.Deque>> repeatedElementsStack = new java.util.ArrayDeque<>(); + private Object rootResult = null; + + // DOM building state (for XPath processing when needed) + private final boolean buildDom; + private Document domDocument = null; + private final java.util.Deque domElementStack = new java.util.ArrayDeque<>(); + + public XmlStreamingWithDomHandler(boolean buildDom) { + this.buildDom = buildDom; + } + + @Override + public void startDocument() throws org.xml.sax.SAXException { + // Initialize DOM document if needed + if (buildDom) { + try { + // Use pre-configured secure DOM factory + // Since we build DOM programmatically (createElementNS/createElement), + // the factory's namespace awareness doesn't affect our usage + DocumentBuilder builder = DOM_FACTORY.newDocumentBuilder(); + domDocument = builder.newDocument(); + } catch (Exception e) { + throw new org.xml.sax.SAXException("Failed to create DOM document", e); + } + } + } + + @Override + public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) + throws org.xml.sax.SAXException { + String elementName = getElementName(uri, localName, qName); + + // Build structured representation (always) + Map element = new HashMap<>(); + Map> repeatedElements = new HashMap<>(); + + // Process attributes for structured output + for (int i = 0; i < attributes.getLength(); i++) { + String attrName = getAttributeName(attributes.getURI(i), attributes.getLocalName(i), attributes.getQName(i)); + String attrValue = attributes.getValue(i); + + // Apply ignoreEmptyValue filtering to attributes + if (ignoreEmptyValue == false || isEmptyValue(attrValue) == false) { + element.put(attrName, attrValue); + } + } + + elementStack.push(element); + elementNameStack.push(elementName); + textStack.push(new StringBuilder()); + repeatedElementsStack.push(repeatedElements); + + // Build DOM element simultaneously if needed + if (buildDom && domDocument != null) { + org.w3c.dom.Element domElement; + if (uri != null && !uri.isEmpty() && !removeNamespaces) { + domElement = domDocument.createElementNS(uri, qName); + } else { + domElement = domDocument.createElement(removeNamespaces ? localName : qName); + } + + // Add attributes to DOM element + for (int i = 0; i < attributes.getLength(); i++) { + String attrUri = attributes.getURI(i); + String attrLocalName = attributes.getLocalName(i); + String attrQName = attributes.getQName(i); + String attrValue = attributes.getValue(i); + + if (attrUri != null && !attrUri.isEmpty() && !removeNamespaces) { + domElement.setAttributeNS(attrUri, attrQName, attrValue); + } else { + domElement.setAttribute(removeNamespaces ? attrLocalName : attrQName, attrValue); + } + } + + // Add to parent or root + if (domElementStack.isEmpty()) { + domDocument.appendChild(domElement); + } else { + domElementStack.peek().appendChild(domElement); + } + + domElementStack.push(domElement); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException { + // Add to structured output text accumulator + if (!textStack.isEmpty()) { + textStack.peek().append(ch, start, length); + } + + // Add to DOM text node if needed + if (buildDom && !domElementStack.isEmpty()) { + String text = new String(ch, start, length); + if (!text.trim().isEmpty() || !ignoreEmptyValue) { + org.w3c.dom.Text textNode = domDocument.createTextNode(text); + domElementStack.peek().appendChild(textNode); + } + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException { + // Complete structured output element processing + if (elementStack.isEmpty()) { + return; + } + + Map element = elementStack.pop(); + String elementName = elementNameStack.pop(); + StringBuilder textContent = textStack.pop(); + Map> repeatedElements = repeatedElementsStack.pop(); + + // Add repeated elements as arrays + for (Map.Entry> entry : repeatedElements.entrySet()) { + List values = entry.getValue(); + if (ignoreEmptyValue == false || values.isEmpty() == false) { + element.put(entry.getKey(), values); + } + } + + // Process text content and determine final element structure + String trimmedText = textContent.toString().trim(); + boolean hasText = trimmedText.isEmpty() == false; + boolean hasChildren = element.size() > 0; + + Object elementValue; + if (hasText == false && hasChildren == false) { + // Empty element + if (ignoreEmptyValue == false) { + elementValue = applyForceArray(elementName, null); + } else { + elementValue = null; + } + } else if (hasText && hasChildren == false) { + // Only text content + if (forceContent) { + Map contentMap = new HashMap<>(); + if (ignoreEmptyValue == false || isEmptyValue(trimmedText) == false) { + contentMap.put("#text", trimmedText); + } + elementValue = contentMap; + } else { + if (ignoreEmptyValue && isEmptyValue(trimmedText)) { + elementValue = null; + } else { + elementValue = trimmedText; + } + } + elementValue = applyForceArray(elementName, elementValue); + } else if (hasText == false && hasChildren) { + // Only child elements/attributes + elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; + } else { + // Both text and children/attributes + if (ignoreEmptyValue == false || isEmptyValue(trimmedText) == false) { + element.put("#text", trimmedText); + } + elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; + } + + // If this is the root element, store the result + if (elementStack.isEmpty()) { + if (elementValue != null) { + Map result = new HashMap<>(); + result.put(elementName, elementValue); + rootResult = result; + } + } else { + // Add to parent element + if (elementValue != null) { + Map parentElement = elementStack.peek(); + Map> parentRepeatedElements = repeatedElementsStack.peek(); + + if (parentElement.containsKey(elementName) || parentRepeatedElements.containsKey(elementName)) { + // Handle repeated elements + if (parentRepeatedElements.containsKey(elementName) == false) { + List list = new ArrayList<>(); + list.add(parentElement.get(elementName)); + parentRepeatedElements.put(elementName, list); + parentElement.remove(elementName); + } + parentRepeatedElements.get(elementName).add(elementValue); + } else { + // Apply force_array logic for single elements + Object finalContent = applyForceArray(elementName, elementValue); + parentElement.put(elementName, finalContent); + } + } + } + + // Complete DOM element if building DOM + if (buildDom && !domElementStack.isEmpty()) { + domElementStack.pop(); + } + } + + @Override + public void endDocument() throws org.xml.sax.SAXException { + // Document parsing complete + } + + public Object getStructuredResult() { + return rootResult; + } + + public Document getDomDocument() { + return domDocument; + } + + private String getElementName(String uri, String localName, String qName) { + String elementName; + if (removeNamespaces) { + elementName = localName != null && !localName.isEmpty() ? localName : qName; + } else { + elementName = qName; + } + + // Apply toLower if enabled + if (toLower) { + elementName = elementName.toLowerCase(Locale.ROOT); + } + + return elementName; + } + + private String getAttributeName(String uri, String localName, String qName) { + String attrName; + if (removeNamespaces) { + attrName = localName != null && !localName.isEmpty() ? localName : qName; + } else { + attrName = qName; + } + + // Apply toLower if enabled + if (toLower) { + attrName = attrName.toLowerCase(Locale.ROOT); + } + + return attrName; + } + } + + /** + * Creates a secure, pre-configured SAX parser factory for XML parsing. + * This factory is configured to prevent XXE attacks with SAX-specific features. + */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactory() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured namespace-aware SAX parser factory for XML parsing. + * This factory is configured to prevent XXE attacks and has namespace awareness enabled. + */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNamespaceAware() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + factory.setNamespaceAware(true); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured SAX parser factory for strict XML parsing. + * This factory is configured to prevent XXE attacks and has strict validation enabled. + */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryStrict() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + + // Enable strict parsing features + factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured namespace-aware SAX parser factory for strict XML parsing. + * This factory is configured to prevent XXE attacks, has namespace awareness enabled, and strict validation. + */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNamespaceAwareStrict() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + factory.setNamespaceAware(true); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + + // Enable strict parsing features + factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured DocumentBuilderFactory for DOM creation. + * Since we only use this factory to create empty DOM documents programmatically + * (not to parse XML), XXE security features are not needed here. + * The SAX parser handles all XML parsing with appropriate security measures. + */ + private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); // Enable for maximum compatibility + factory.setValidating(false); + + // No XXE security features needed - we only create empty documents, + // never parse XML with this factory + + return factory; + } + + /** + * Selects the appropriate pre-configured SAX parser factory based on processor configuration. + * + * Factory selection matrix: + * - Regular parsing, no namespaces: SAX_PARSER_FACTORY + * - Regular parsing, with namespaces: SAX_PARSER_FACTORY_NS + * - Strict parsing, no namespaces: SAX_PARSER_FACTORY_STRICT + * - Strict parsing, with namespaces: SAX_PARSER_FACTORY_NS_STRICT + * + * @return the appropriate SAX parser factory for the current configuration + */ + private javax.xml.parsers.SAXParserFactory selectSaxParserFactory() { + boolean needsNamespaceAware = hasNamespaces() || removeNamespaces; + if (isStrict()) { + return needsNamespaceAware ? SAX_PARSER_FACTORY_NS_STRICT : SAX_PARSER_FACTORY_STRICT; + } else { + return needsNamespaceAware ? SAX_PARSER_FACTORY_NS : SAX_PARSER_FACTORY; + } + } +} diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java new file mode 100644 index 0000000000000..e693c39a5cab3 --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -0,0 +1,400 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.test.ESTestCase; + +import java.util.HashMap; +import java.util.Map; + +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; + +public class XmlProcessorFactoryTests extends ESTestCase { + + private static final String DEFAULT_FIELD = "field1"; + private static final String DEFAULT_TARGET_FIELD = "target"; + + /** + * Creates a new XmlProcessor.Factory instance for testing. + */ + private XmlProcessor.Factory createFactory() { + return new XmlProcessor.Factory(); + } + + /** + * Creates a basic configuration map with the specified field name. + */ + private Map createBaseConfig(String fieldName) { + Map config = new HashMap<>(); + config.put("field", fieldName); + return config; + } + + /** + * Creates a basic configuration map with the default field name. + */ + private Map createBaseConfig() { + return createBaseConfig(DEFAULT_FIELD); + } + + /** + * Creates a configuration map with XPath expressions. + */ + private Map createConfigWithXPath(String fieldName, Map xpathExpressions) { + Map config = createBaseConfig(fieldName); + config.put("xpath", xpathExpressions); + return config; + } + + /** + * Creates a configuration map with namespace definitions. + */ + private Map createConfigWithNamespaces(String fieldName, Map namespaces) { + Map config = createBaseConfig(fieldName); + config.put("namespaces", namespaces); + return config; + } + + /** + * Creates a configuration map with both XPath expressions and namespaces. + */ + private Map createConfigWithXPathAndNamespaces( + String fieldName, + Map xpathExpressions, + Map namespaces + ) { + Map config = createBaseConfig(fieldName); + config.put("xpath", xpathExpressions); + config.put("namespaces", namespaces); + return config; + } + + /** + * Creates a processor with the given factory and configuration. + */ + private XmlProcessor createProcessor(XmlProcessor.Factory factory, Map config) throws Exception { + String processorTag = randomAlphaOfLength(10); + return factory.create(null, processorTag, null, config, null); + } + + /** + * Creates a processor with the default factory and given configuration. + */ + private XmlProcessor createProcessor(Map config) throws Exception { + return createProcessor(createFactory(), config); + } + + /** + * Helper method to create XPath configuration map. + */ + private Map createXPathConfig(String... expressionsAndFields) { + if (expressionsAndFields.length % 2 != 0) { + throw new IllegalArgumentException("Must provide even number of arguments (expression, field, expression, field, ...)"); + } + + Map xpathConfig = new HashMap<>(); + for (int i = 0; i < expressionsAndFields.length; i += 2) { + xpathConfig.put(expressionsAndFields[i], expressionsAndFields[i + 1]); + } + return xpathConfig; + } + + /** + * Helper method to create namespace configuration map. + */ + private Map createNamespaceConfig(String... prefixesAndUris) { + if (prefixesAndUris.length % 2 != 0) { + throw new IllegalArgumentException("Must provide even number of arguments (prefix, uri, prefix, uri, ...)"); + } + + Map namespaceConfig = new HashMap<>(); + for (int i = 0; i < prefixesAndUris.length; i += 2) { + namespaceConfig.put(prefixesAndUris[i], prefixesAndUris[i + 1]); + } + return namespaceConfig; + } + + /** + * Helper method to create configuration with common boolean options. + */ + private Map createConfigWithOptions(String fieldName, String... options) { + Map config = createBaseConfig(fieldName); + + for (String option : options) { + switch (option) { + case "ignore_missing": + config.put("ignore_missing", true); + break; + case "ignore_failure": + config.put("ignore_failure", true); + break; + case "to_lower": + config.put("to_lower", true); + break; + case "ignore_empty_value": + config.put("ignore_empty_value", true); + break; + case "store_xml": + config.put("store_xml", false); // Test false case since default is true + break; + case "remove_namespaces": + config.put("remove_namespaces", true); + break; + case "force_content": + config.put("force_content", true); + break; + case "force_array": + config.put("force_array", true); + break; + case "strict": + config.put("parse_options", "strict"); + break; + default: + throw new IllegalArgumentException("Unknown option: " + option); + } + } + + return config; + } + + /** + * Helper to expect processor creation failure with specific message. + */ + private void expectCreationFailure(Map config, Class exceptionClass, String expectedMessage) { + XmlProcessor.Factory factory = createFactory(); + String processorTag = randomAlphaOfLength(10); + + Exception exception = expectThrows(exceptionClass, () -> factory.create(null, processorTag, null, config, null)); + assertThat(exception.getMessage(), equalTo(expectedMessage)); + } + + /** + * Tests processor creation with various configurations. + */ + public void testCreate() throws Exception { + Map config = createBaseConfig(); + config.put("target_field", DEFAULT_TARGET_FIELD); + config.put("ignore_missing", true); + config.put("ignore_failure", true); + config.put("to_lower", true); + config.put("ignore_empty_value", true); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getTargetField(), equalTo(DEFAULT_TARGET_FIELD)); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + } + + public void testCreateWithDefaults() throws Exception { + Map config = createBaseConfig(); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getTargetField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isIgnoreMissing(), equalTo(false)); + assertThat(processor.isIgnoreEmptyValue(), equalTo(false)); + } + + public void testCreateMissingField() throws Exception { + Map config = new HashMap<>(); // Empty config - no field specified + expectCreationFailure(config, ElasticsearchParseException.class, "[field] required property is missing"); + } + + public void testCreateWithIgnoreEmptyValueOnly() throws Exception { + Map config = createBaseConfig(); + config.put("ignore_empty_value", true); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + assertThat(processor.isIgnoreMissing(), equalTo(false)); // other flags should remain default + } + + public void testCreateWithXPath() throws Exception { + Map xpathConfig = createXPathConfig("//author/text()", "author_field", "//title/@lang", "language_field"); + Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + } + + public void testCreateWithInvalidXPathConfig() throws Exception { + Map config = createBaseConfig(); + config.put("xpath", "invalid_string"); // Should be a map + + expectCreationFailure(config, IllegalArgumentException.class, "XPath configuration must be a map of expressions to target fields"); + } + + public void testCreateWithInvalidXPathTargetField() throws Exception { + Map config = createBaseConfig(); + + Map xpathConfig = new HashMap<>(); + xpathConfig.put("//author/text()", 123); // Should be string + config.put("xpath", xpathConfig); + + expectCreationFailure( + config, + IllegalArgumentException.class, + "XPath target field [//author/text()] must be a string, got [Integer]" + ); + } + + public void testCreateWithNamespaces() throws Exception { + Map namespacesConfig = createNamespaceConfig( + "book", + "http://example.com/book", + "author", + "http://example.com/author" + ); + Map config = createConfigWithNamespaces(DEFAULT_FIELD, namespacesConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getNamespaces(), equalTo(namespacesConfig)); + } + + public void testCreateWithInvalidNamespacesConfig() throws Exception { + Map config = createBaseConfig(); + config.put("namespaces", "invalid_string"); // Should be a map + + expectCreationFailure(config, IllegalArgumentException.class, "Namespaces configuration must be a map of prefixes to URIs"); + } + + public void testCreateWithInvalidNamespaceURI() throws Exception { + Map config = createBaseConfig(); + + Map namespacesConfig = new HashMap<>(); + namespacesConfig.put("book", 123); // Should be string + config.put("namespaces", namespacesConfig); + + expectCreationFailure(config, IllegalArgumentException.class, "Namespace prefix [book] must have a string URI, got [Integer]"); + } + + public void testCreateWithXPathAndNamespaces() throws Exception { + Map xpathConfig = createXPathConfig("//book:author/text()", "author_field", "//book:title/@lang", "language_field"); + Map namespacesConfig = createNamespaceConfig("book", "http://example.com/book"); + Map config = createConfigWithXPathAndNamespaces(DEFAULT_FIELD, xpathConfig, namespacesConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getNamespaces(), equalTo(namespacesConfig)); + } + + // Tests for individual boolean options + + public void testCreateWithStoreXmlFalse() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "store_xml"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isStoreXml(), equalTo(false)); + } + + public void testCreateWithRemoveNamespaces() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "remove_namespaces"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isRemoveNamespaces(), equalTo(true)); + } + + public void testCreateWithForceContent() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "force_content"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isForceContent(), equalTo(true)); + } + + public void testCreateWithForceArray() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "force_array"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isForceArray(), equalTo(true)); + } + + public void testCreateWithStrictParseOptions() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "strict"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getParseOptions(), equalTo("strict")); + assertThat(processor.isStrict(), equalTo(true)); + } + + public void testCreateWithMultipleOptions() throws Exception { + Map config = createConfigWithOptions( + DEFAULT_FIELD, + "ignore_missing", + "force_content", + "force_array", + "remove_namespaces" + ); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + assertThat(processor.isForceContent(), equalTo(true)); + assertThat(processor.isForceArray(), equalTo(true)); + assertThat(processor.isRemoveNamespaces(), equalTo(true)); + } + + // Tests for invalid parse options + + public void testCreateWithInvalidParseOptions() throws Exception { + Map config = createBaseConfig(); + config.put("parse_options", "invalid_option"); + + expectCreationFailure( + config, + IllegalArgumentException.class, + "Invalid parse_options [invalid_option]. Only 'strict' is supported." + ); + } + + // Tests for XPath compilation errors (testing precompilation feature) + + public void testCreateWithInvalidXPathExpression() throws Exception { + Map xpathConfig = createXPathConfig("invalid xpath ][", "target_field"); + Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + + XmlProcessor.Factory factory = createFactory(); + String processorTag = randomAlphaOfLength(10); + + IllegalArgumentException exception = expectThrows( + IllegalArgumentException.class, + () -> factory.create(null, processorTag, null, config, null) + ); + + // Check that the error message contains the XPath expression and indicates it's invalid + assertThat(exception.getMessage(), containsString("Invalid XPath expression [invalid xpath ][]:")); + assertThat(exception.getMessage(), containsString("javax.xml.transform.TransformerException")); + } + + public void testCreateWithXPathUsingNamespacesWithoutConfiguration() throws Exception { + Map xpathConfig = createXPathConfig("//book:title/text()", "title_field"); + Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + + expectCreationFailure( + config, + IllegalArgumentException.class, + "Invalid XPath expression [//book:title/text()]: contains namespace prefixes but no namespace configuration provided" + ); + } +} diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java new file mode 100644 index 0000000000000..c01c086be1eb5 --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -0,0 +1,635 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.test.ESTestCase; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.hamcrest.Matchers.equalTo; + +/** + * Tests for {@link XmlProcessor}. These tests ensure feature parity and test coverage. + */ +@SuppressWarnings("unchecked") +public class XmlProcessorTests extends ESTestCase { + + private static final String XML_FIELD = "xmldata"; + private static final String TARGET_FIELD = "data"; + + private static IngestDocument createTestIngestDocument(String xml) { + return new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of(XML_FIELD, xml))); + } + + private static XmlProcessor createTestProcessor(Map config) { + config.putIfAbsent("field", XML_FIELD); + config.putIfAbsent("target_field", TARGET_FIELD); + + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + try { + return factory.create(null, "_tag", null, config, null); + } catch (Exception e) { + fail("Failed to create XmlProcessor: " + e.getMessage()); + return null; // This line will never be reached, but is needed to satisfy the compiler + } + } + + /** + * Test parsing standard XML with attributes. + */ + public void testParseStandardXml() { + String xml = ""; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + assertThat(foo.get("key"), equalTo("value")); + } + + /** + * Test parsing XML with array elements (multiple elements with same name). + */ + public void testParseXmlWithArrayValue() { + String xml = "value1value2"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + List keyValues = (List) foo.get("key"); + assertThat(keyValues.size(), equalTo(2)); + + // The values might be nested inside their own lists + Object firstValue = keyValues.get(0); + assertThat(firstValue, equalTo("value1")); + + Object secondValue = keyValues.get(1); + assertThat(secondValue, equalTo("value2")); + } + + /** + * Test parsing XML with nested elements. + */ + public void testParseXmlWithNestedElements() { + String xml = "value"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + Map key1Map = (Map) foo.get("key1"); + assertThat(key1Map.size(), equalTo(1)); + + String key2Value = (String) key1Map.get("key2"); + assertThat(key2Value, equalTo("value")); + } + + /** + * Test parsing XML in a single item array. + */ + public void testParseXmlInSingleItemArray() { + String xml = ""; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + assertThat(foo.get("bar"), equalTo("baz")); + } + + /** + * Test extracting a single element using XPath. + */ + public void testXPathSingleElementExtraction() { + String xml = "helloworld"; + + Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + // Get the XPath result + Object barContent = ingestDocument.getFieldValue("bar_content", Object.class); + assertNotNull(barContent); + assertEquals("hello", barContent); + + // Verify that the full parsed XML is also available + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + assertNotNull(foo); + assertThat(foo.get("bar"), equalTo("hello")); + assertThat(foo.get("baz"), equalTo("world")); + } + + /** + * Test extracting multiple elements using XPath. + */ + public void testXPathMultipleElementsExtraction() { + String xml = "firstsecondthird"; + + Map xpathMap = Map.of("/foo/bar", "all_bars"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + List allBars = ingestDocument.getFieldValue("all_bars", List.class); + + assertNotNull(allBars); + assertThat(allBars.size(), equalTo(3)); + assertThat(allBars.get(0), equalTo("first")); + assertThat(allBars.get(1), equalTo("second")); + assertThat(allBars.get(2), equalTo("third")); + } + + /** + * Test extracting attributes using XPath. + */ + public void testXPathAttributeExtraction() { + String xml = "content"; + + Map xpathMap = new HashMap<>(); + xpathMap.put("/foo/bar/@id", "bar_id"); + xpathMap.put("/foo/bar/@type", "bar_type"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + String barId = ingestDocument.getFieldValue("bar_id", String.class); + assertNotNull(barId); + assertThat(barId, equalTo("123")); + + String barType = ingestDocument.getFieldValue("bar_type", String.class); + assertNotNull(barType); + assertThat(barType, equalTo("test")); + } + + /** + * Test extracting elements with namespaces using XPath. + */ + public void testXPathNamespacedExtraction() { + String xml = "" + + "" + + " namespace-value" + + " regular-value" + + ""; + + Map namespaces = Map.of("myns", "http://example.org/ns1"); + Map xpathMap = Map.of("//myns:element/text()", "ns_value"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + config.put("namespaces", namespaces); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + String nsValue = ingestDocument.getFieldValue("ns_value", String.class); + assertNotNull(nsValue); + assertThat(nsValue, equalTo("namespace-value")); + } + + /** + * Test parsing XML with mixed content (text and elements mixed together). + */ + public void testParseXmlWithMixedContent() { + String xml = "This text is bold and this is italic!"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + assertNotNull(foo.get("b")); + assertThat((String) foo.get("b"), equalTo("bold")); + assertNotNull(foo.get("i")); + assertThat((String) foo.get("i"), equalTo("italic")); + assertNotNull(foo.get("#text")); + assertThat((String) foo.get("#text"), equalTo("This text is and this is !")); + } + + /** + * Test parsing XML with CDATA sections. + */ + public void testParseXmlWithCDATA() { + String xml = " that shouldn't be parsed!]]>"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Object content = data.get("foo"); + + assertNotNull(content); + assertThat(content, equalTo("This is CDATA content with that shouldn't be parsed!")); + } + + /** + * Test parsing XML with numeric data. + */ + public void testParseXmlWithNumericData() { + String xml = "12399.95true"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + assertThat((String) foo.get("count"), equalTo("123")); + assertThat((String) foo.get("price"), equalTo("99.95")); + assertThat((String) foo.get("active"), equalTo("true")); + } + + /** + * Test parsing XML with force_array option enabled. + */ + public void testParseXmlWithForceArray() { + String xml = "single_value"; + + Map config = new HashMap<>(); + config.put("force_array", true); // Enable force_array option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + // With force_array=true, even single values should be in arrays + Object barValue = foo.get("bar"); + assertNotNull(barValue); + assertTrue("Expected bar value to be a List with force_array=true", barValue instanceof List); + + List barList = (List) barValue; + assertThat(barList.size(), equalTo(1)); + assertThat(barList.get(0), equalTo("single_value")); + } + + /** + * Test extracting multiple elements using multiple XPath expressions. + * Tests that multiple XPath expressions can be used simultaneously. + */ + public void testMultipleXPathExpressions() { + String xml = "" + + " John30" + + " Jane25" + + ""; + + // Configure multiple XPath expressions + Map xpathMap = new HashMap<>(); + xpathMap.put("/root/person[1]/n/text()", "first_person_name"); + xpathMap.put("/root/person[2]/n/text()", "second_person_name"); + xpathMap.put("/root/person/@id", "person_ids"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + assertTrue("first_person_name field should exist", ingestDocument.hasField("first_person_name")); + assertTrue("second_person_name field should exist", ingestDocument.hasField("second_person_name")); + assertTrue("person_ids field should exist", ingestDocument.hasField("person_ids")); + + Object firstName = ingestDocument.getFieldValue("first_person_name", Object.class); + assertEquals("John", firstName); + + Object secondName = ingestDocument.getFieldValue("second_person_name", Object.class); + assertEquals("Jane", secondName); + + Object personIdsObj = ingestDocument.getFieldValue("person_ids", Object.class); + assertTrue("person_ids should be a List", personIdsObj instanceof List); + List personIds = (List) personIdsObj; + assertEquals("Should have 2 person IDs", 2, personIds.size()); + assertEquals("First person ID should be '1'", "1", personIds.get(0)); + assertEquals("Second person ID should be '2'", "2", personIds.get(1)); + + assertTrue("Target field should exist", ingestDocument.hasField(TARGET_FIELD)); + } + + /** + * Test handling of invalid XML with ignoreFailure=false. + */ + public void testInvalidXml() { + String xml = ""; // Invalid XML missing closing tag + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { processor.execute(ingestDocument); }); + + assertTrue( + "Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || exception.getCause().getMessage().contains("XML") + ); + } + + /** + * Test handling of invalid XML with ignoreFailure=true. + */ + public void testInvalidXmlWithIgnoreFailure() { + String xml = ""; // Invalid XML missing closing tag + + Map config = new HashMap<>(); + config.put("ignore_failure", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + List tags = ingestDocument.getFieldValue("tags", List.class); + assertNotNull(tags); + assertTrue(tags.contains("_xmlparsefailure")); + } + + /** + * Test the store_xml=false option to not store parsed XML in target field. + */ + public void testNoStoreXml() { + String xml = "value"; + + // Set up XPath to extract value but don't store XML + Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); + + Map config = new HashMap<>(); + config.put("store_xml", false); // Do not store XML in target field + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + // Verify XPath result is stored + String barContent = ingestDocument.getFieldValue("bar_content", String.class); + assertNotNull(barContent); + assertThat(barContent, equalTo("value")); + + // Verify the target field was not created + assertFalse(ingestDocument.hasField(TARGET_FIELD)); + } + + /** + * Test the to_lower option for converting field names to lowercase. + */ + public void testToLower() { + String xml = "value"; + + Map config = new HashMap<>(); + config.put("to_lower", true); // Enable to_lower option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + // Verify field names are lowercase + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + assertTrue(data.containsKey("foo")); + assertFalse(data.containsKey("FOO")); + + Map foo = (Map) data.get("foo"); + assertTrue(foo.containsKey("bar")); + assertFalse(foo.containsKey("BAR")); + assertThat(foo.get("bar"), equalTo("value")); + } + + /** + * Test the ignore_missing option when field is missing. + */ + public void testIgnoreMissing() { + String xmlField = "nonexistent_field"; + + Map config = new HashMap<>(); + config.put("field", xmlField); + config.put("ignore_missing", true); // Enable ignore_missing option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of())); + processor.execute(ingestDocument); + + assertFalse("Target field should not be created when source field is missing", ingestDocument.hasField(TARGET_FIELD)); + + // With ignoreMissing=false + config.put("ignore_missing", false); + XmlProcessor failingProcessor = createTestProcessor(config); + + // This should throw an exception + IllegalArgumentException exception = expectThrows( + IllegalArgumentException.class, + () -> { failingProcessor.execute(ingestDocument); } + ); + + assertTrue(exception.getMessage().contains("not present as part of path")); + } + + /** + * Test that ignore_empty_value correctly filters out empty values from arrays and mixed content. + */ + public void testIgnoreEmptyValue() { + // XML with mixed empty and non-empty elements, including array elements with mixed empty/non-empty values + String xml = "" + + " " + + " " + + " content" + + " nested-content" + + " " + + " first" + + " " + + " third" + + " " + + " fifth" + + " " + + " Text with and content" + + ""; + + Map config = new HashMap<>(); + config.put("ignore_empty_value", true); + XmlProcessor processor = createTestProcessor(config); + + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); + + Map result = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map root = (Map) result.get("root"); + + // Check empty elements are filtered + assertFalse("Empty element should be filtered out", root.containsKey("empty")); + assertFalse("Blank element should be filtered out", root.containsKey("blank")); + + // Check valid elements are preserved + assertTrue("Valid element should be preserved", root.containsKey("valid")); + assertEquals("content", root.get("valid")); + + // Check nested structure filtering + Map nested = (Map) root.get("nested"); + assertNotNull("Nested element should be preserved", nested); + assertFalse("Empty nested element should be filtered", nested.containsKey("empty")); + assertEquals("nested-content", nested.get("valid")); + + // Check array with mixed empty/non-empty values + Map items = (Map) root.get("items"); + assertNotNull("Items object should be preserved", items); + List itemList = (List) items.get("item"); + assertNotNull("Item array should be preserved", itemList); + assertEquals("Array should contain only non-empty items", 3, itemList.size()); + assertEquals("first", itemList.get(0)); + assertEquals("third", itemList.get(1)); + assertEquals("fifth", itemList.get(2)); + + // Check mixed content handling + Map mixed = (Map) root.get("mixed"); + assertNotNull("Mixed content should be preserved", mixed); + assertFalse("Empty element in mixed content should be filtered", mixed.containsKey("empty")); + assertTrue("Valid element in mixed content should be preserved", mixed.containsKey("valid")); + assertEquals("content", mixed.get("valid")); + assertEquals("Text with and", mixed.get("#text")); + } + + /** + * Test parsing with strict mode option. + */ + public void testStrictParsing() { + String xml = "valid"; + + Map config = new HashMap<>(); + config.put("parse_options", "strict"); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map foo = (Map) data.get("foo"); + assertThat(foo.get("bar"), equalTo("valid")); + + // Test with invalid XML in strict mode + String invalidXml = ""; + IngestDocument invalidDocument = createTestIngestDocument(invalidXml); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { processor.execute(invalidDocument); }); + + assertTrue( + "Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || exception.getCause().getMessage().contains("XML") + ); + } + + /** + * Test parsing XML with remove_namespaces option. + */ + public void testRemoveNamespaces() { + String xml = "value"; + + Map config = new HashMap<>(); + config.put("remove_namespaces", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map foo = (Map) data.get("foo"); + + assertTrue("Element without namespace should be present", foo.containsKey("bar")); + assertThat(foo.get("bar"), equalTo("value")); + + // Now test with removeNamespaces=false + IngestDocument ingestDocument2 = createTestIngestDocument(xml); + + config.put("remove_namespaces", false); + XmlProcessor processor2 = createTestProcessor(config); + processor2.execute(ingestDocument2); + + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); + Map foo2 = (Map) data2.get("foo"); + + // With removeNamespaces=false, the "ns:" prefix should be preserved + assertTrue("Element should be accessible with namespace prefix", foo2.containsKey("ns:bar")); + assertThat(foo2.get("ns:bar"), equalTo("value")); + } + + /** + * Test the force_content option. + */ + public void testForceContent() { + String xml = "simple text"; + + Map config = new HashMap<>(); + config.put("force_content", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map foo = (Map) data.get("foo"); + + // With forceContent=true, the text should be in a #text field + assertTrue("Text content should be in #text field", foo.containsKey("#text")); + assertThat(foo.get("#text"), equalTo("simple text")); + + // Now test with forceContent=false + config.put("force_content", false); + XmlProcessor processor2 = createTestProcessor(config); + IngestDocument ingestDocument2 = createTestIngestDocument(xml); + + processor2.execute(ingestDocument2); + + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); + + // With forceContent=false, the text should be directly assigned to the element + assertThat(data2.get("foo"), equalTo("simple text")); + } +}