From 67dd26466f7d4fb6ac4da203f4f4006f03d1ea2a Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 30 Jun 2025 16:12:04 +0200 Subject: [PATCH 1/9] Add XmlProcessor initial implementation --- docs/reference/enrich-processor/index.md | 3 + docs/reference/enrich-processor/toc.yml | 1 + .../enrich-processor/xml-processor.md | 281 ++++++++ .../src/main/java/module-info.java | 2 + .../ingest/common/IngestCommonPlugin.java | 3 +- .../ingest/common/XmlProcessor.java | 340 ++++++++++ .../common/XmlProcessorFactoryTests.java | 82 +++ .../ingest/common/XmlProcessorTests.java | 626 ++++++++++++++++++ 8 files changed, 1337 insertions(+), 1 deletion(-) create mode 100644 docs/reference/enrich-processor/xml-processor.md create mode 100644 modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java create mode 100644 modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java create mode 100644 modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java diff --git a/docs/reference/enrich-processor/index.md b/docs/reference/enrich-processor/index.md index e220e763024e3..fb2cac99ee355 100644 --- a/docs/reference/enrich-processor/index.md +++ b/docs/reference/enrich-processor/index.md @@ -159,6 +159,9 @@ Refer to [Enrich your data](docs-content://manage-data/ingest/transform-enrich/d [`split` processor](/reference/enrich-processor/split-processor.md) : Splits a field into an array of values. +[`xml` processor](/reference/enrich-processor/xml-processor.md) +: Parses XML documents and converts them to JSON objects. + [`trim` processor](/reference/enrich-processor/trim-processor.md) : Trims whitespace from field. diff --git a/docs/reference/enrich-processor/toc.yml b/docs/reference/enrich-processor/toc.yml index 7da271e6f0554..370c020f5f393 100644 --- a/docs/reference/enrich-processor/toc.yml +++ b/docs/reference/enrich-processor/toc.yml @@ -46,3 +46,4 @@ toc: - file: urldecode-processor.md - file: uri-parts-processor.md - file: user-agent-processor.md + - file: xml-processor.md diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md new file mode 100644 index 0000000000000..0621a66d8edc6 --- /dev/null +++ b/docs/reference/enrich-processor/xml-processor.md @@ -0,0 +1,281 @@ +--- +navigation_title: "XML" +mapped_pages: + - https://www.elastic.co/guide/en/elasticsearch/reference/current/xml-processor.html +--- + +# XML processor [xml-processor] + + +Parses XML documents and converts them to JSON objects using a streaming XML parser. This processor efficiently handles XML data by avoiding loading the entire document into memory. + +$$$xml-options$$$ + +| Name | Required | Default | Description | +| --- | --- | --- | --- | +| `field` | yes | - | The field containing the XML string to be parsed. | +| `target_field` | no | `field` | The field that the converted structured object will be written into. Any existing content in this field will be overwritten. | +| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document. | +| `ignore_failure` | no | `false` | Ignore failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `to_lower` | no | `false` | Convert XML element names to lowercase. | +| `ignore_empty_value` | no | `false` | If `true`, the processor will filter out null and empty values from the parsed XML structure, including empty elements, elements with null values, and elements with whitespace-only content. | +| `description` | no | - | Description of the processor. Useful for describing the purpose of the processor or its configuration. | +| `if` | no | - | Conditionally execute the processor. See [Conditionally run a processor](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#conditionally-run-processor). | +| `on_failure` | no | - | Handle failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `tag` | no | - | Identifier for the processor. Useful for debugging and metrics. | + +## Configuration + +```js +{ + "xml": { + "field": "xml_field", + "target_field": "parsed_xml", + "ignore_empty_value": true + } +} +``` + +## Examples + +### Basic XML parsing + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "William H. GaddisThe RecognitionsOne of the great seminal American novels." + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + "_index": "_index", + "_id": "_id", + "_version": "-3", + "_source": { + "xml_content": "William H. GaddisThe RecognitionsOne of the great seminal American novels.", + "catalog": { + "book": { + "author": "William H. Gaddis", + "title": "The Recognitions", + "review": "One of the great seminal American novels." + } + } + }, + "_ingest": { + "timestamp": "2019-03-11T21:54:37.909224Z" + } + } + } + ] +} +``` + +### Filtering empty values + +When `ignore_empty_value` is set to `true`, the processor will remove empty elements from the parsed XML: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "target_field": "parsed_xml", + "ignore_empty_value": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "William H. GaddisOne of the great seminal American novels. Some content" + } + } + ] +} +``` + +Result with empty elements filtered out: + +```console-result +{ + "docs": [ + { + "doc": { + "_index": "_index", + "_id": "_id", + "_version": "-3", + "_source": { + "xml_content": "William H. GaddisOne of the great seminal American novels. Some content", + "parsed_xml": { + "catalog": { + "book": { + "author": "William H. Gaddis", + "review": "One of the great seminal American novels.", + "nested": { + "valid_content": "Some content" + } + } + } + } + }, + "_ingest": { + "timestamp": "2019-03-11T21:54:37.909224Z" + } + } + } + ] +} +``` + +### Converting element names to lowercase + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "to_lower": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "William H. GaddisThe Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + "_index": "_index", + "_id": "_id", + "_version": "-3", + "_source": { + "xml_content": "William H. GaddisThe Recognitions", + "catalog": { + "book": { + "author": "William H. Gaddis", + "title": "The Recognitions" + } + } + }, + "_ingest": { + "timestamp": "2019-03-11T21:54:37.909224Z" + } + } + } + ] +} +``` + +### Handling XML attributes + +XML attributes are included as properties in the resulting JSON object alongside element content: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + "_index": "_index", + "_id": "_id", + "_version": "-3", + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis", + "catalog": { + "version": "1.0", + "book": { + "id": "123", + "isbn": "978-0-684-80335-9", + "title": { + "lang": "en", + "#text": "The Recognitions" + }, + "author": { + "nationality": "American", + "#text": "William H. Gaddis" + } + } + } + }, + "_ingest": { + "timestamp": "2019-03-11T21:54:37.909224Z" + } + } + } + ] +} +``` + +## XML features + +The XML processor supports: + +- **Elements with text content**: Converted to key-value pairs where the element name is the key and text content is the value +- **Nested elements**: Converted to nested JSON objects +- **Empty elements**: Converted to `null` values (can be filtered with `ignore_empty_value`) +- **Repeated elements**: Converted to arrays when multiple elements with the same name exist at the same level +- **XML attributes**: Included as properties in the JSON object alongside element content. When an element has both attributes and text content, the text is stored under a special `#text` key +- **Mixed content**: Elements with both text and child elements include text under a special `#text` key while attributes and child elements become object properties +- **Namespaces**: Local names are used, namespace prefixes are ignored diff --git a/modules/ingest-common/src/main/java/module-info.java b/modules/ingest-common/src/main/java/module-info.java index c3b3ab90892d9..ee98b7515e733 100644 --- a/modules/ingest-common/src/main/java/module-info.java +++ b/modules/ingest-common/src/main/java/module-info.java @@ -19,6 +19,8 @@ requires org.apache.logging.log4j; requires org.apache.lucene.analysis.common; requires org.jruby.joni; + + requires java.xml; exports org.elasticsearch.ingest.common; // for painless diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java index 6e517d644cadb..0dc06e74af3bc 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java @@ -74,7 +74,8 @@ public Map getProcessors(Processor.Parameters paramet entry(TrimProcessor.TYPE, new TrimProcessor.Factory()), entry(URLDecodeProcessor.TYPE, new URLDecodeProcessor.Factory()), entry(UppercaseProcessor.TYPE, new UppercaseProcessor.Factory()), - entry(UriPartsProcessor.TYPE, new UriPartsProcessor.Factory()) + entry(UriPartsProcessor.TYPE, new UriPartsProcessor.Factory()), + entry(XmlProcessor.TYPE, new XmlProcessor.Factory()) ); } diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java new file mode 100644 index 0000000000000..897d36d7084b7 --- /dev/null +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -0,0 +1,340 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.cluster.metadata.ProjectId; +import org.elasticsearch.ingest.AbstractProcessor; +import org.elasticsearch.ingest.ConfigurationUtils; +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.ingest.Processor; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamConstants; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; + +/** + * Processor that parses XML documents and converts them to JSON objects using streaming XML parser. + * This implementation uses XMLStreamReader for efficient parsing and avoids loading the entire document in memory. + */ +public final class XmlProcessor extends AbstractProcessor { + + public static final String TYPE = "xml"; + + private final String field; + private final String targetField; + private final boolean ignoreMissing; + private final boolean ignoreFailure; + private final boolean toLower; + private final boolean ignoreEmptyValue; + + XmlProcessor( + String tag, + String description, + String field, + String targetField, + boolean ignoreMissing, + boolean ignoreFailure, + boolean toLower, + boolean ignoreEmptyValue + ) { + super(tag, description); + this.field = field; + this.targetField = targetField; + this.ignoreMissing = ignoreMissing; + this.ignoreFailure = ignoreFailure; + this.toLower = toLower; + this.ignoreEmptyValue = ignoreEmptyValue; + } + + public String getField() { + return field; + } + + public String getTargetField() { + return targetField; + } + + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public boolean isIgnoreEmptyValue() { + return ignoreEmptyValue; + } + + @Override + public IngestDocument execute(IngestDocument document) { + Object fieldValue = document.getFieldValue(field, Object.class, ignoreMissing); + + if (fieldValue == null && ignoreMissing) { + return document; + } else if (fieldValue == null) { + throw new IllegalArgumentException("field [" + field + "] is null, cannot parse XML"); + } + + if (fieldValue instanceof String == false) { + if (ignoreFailure) { + return document; + } + throw new IllegalArgumentException("field [" + field + "] is not a string, cannot parse XML"); + } + + String xmlString = (String) fieldValue; + try { + Object parsedXml = parseXml(xmlString.trim()); + if (ignoreEmptyValue) { + parsedXml = filterEmptyValues(parsedXml); + } + document.setFieldValue(targetField, parsedXml); + } catch (Exception e) { + if (ignoreFailure) { + return document; + } + throw new IllegalArgumentException("field [" + field + "] contains invalid XML: " + e.getMessage(), e); + } + + return document; + } + + @Override + public String getType() { + return TYPE; + } + + /** + * Recursively removes null and empty values from the parsed XML structure + * when ignoreEmptyValue is enabled. + */ + @SuppressWarnings("unchecked") + private Object filterEmptyValues(Object obj) { + if (obj == null) { + return null; + } + + if (obj instanceof Map) { + Map map = (Map) obj; + Map filtered = new HashMap<>(); + + for (Map.Entry entry : map.entrySet()) { + Object filteredValue = filterEmptyValues(entry.getValue()); + if (filteredValue != null && isEmptyValue(filteredValue) == false) { + filtered.put(entry.getKey(), filteredValue); + } + } + + return filtered.isEmpty() ? null : filtered; + } + + if (obj instanceof List) { + List list = (List) obj; + List filtered = new ArrayList<>(); + + for (Object item : list) { + Object filteredItem = filterEmptyValues(item); + if (filteredItem != null && isEmptyValue(filteredItem) == false) { + filtered.add(filteredItem); + } + } + + return filtered.isEmpty() ? null : filtered; + } + + return isEmptyValue(obj) ? null : obj; + } + + /** + * Determines if a value should be considered empty for filtering purposes. + */ + private boolean isEmptyValue(Object value) { + if (value == null) { + return true; + } + if (value instanceof String) { + return ((String) value).trim().isEmpty(); + } + if (value instanceof Map) { + return ((Map) value).isEmpty(); + } + if (value instanceof List) { + return ((List) value).isEmpty(); + } + return false; + } + + private Object parseXml(String xmlString) throws XMLStreamException { + if (xmlString == null || xmlString.trim().isEmpty()) { + return null; + } + + XMLInputFactory factory = XMLInputFactory.newInstance(); + factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); + factory.setProperty(XMLInputFactory.IS_COALESCING, true); + factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); + factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); + + try (StringReader reader = new StringReader(xmlString)) { + XMLStreamReader xmlReader = factory.createXMLStreamReader(reader); + + // Skip to the first element + while (xmlReader.hasNext() && xmlReader.getEventType() != XMLStreamConstants.START_ELEMENT) { + xmlReader.next(); + } + + if (xmlReader.hasNext() == false) { + return null; + } + + Object result = parseElement(xmlReader); + xmlReader.close(); + return result; + } + } + + private Object parseElement(XMLStreamReader reader) throws XMLStreamException { + if (reader.getEventType() != XMLStreamConstants.START_ELEMENT) { + return null; + } + + String elementName = reader.getLocalName(); + if (toLower) { + elementName = elementName.toLowerCase(Locale.ROOT); + } + + Map element = new HashMap<>(); + Map> repeatedElements = new HashMap<>(); + + // Parse attributes - they are available in START_ELEMENT state + int attributeCount = reader.getAttributeCount(); + boolean hasAttributes = attributeCount > 0; + for (int i = 0; i < attributeCount; i++) { + String attrName = reader.getAttributeLocalName(i); + String attrValue = reader.getAttributeValue(i); + if (toLower) { + attrName = attrName.toLowerCase(Locale.ROOT); + } + element.put(attrName, attrValue); + } + + StringBuilder textContent = new StringBuilder(); + + while (reader.hasNext()) { + int eventType = reader.next(); + + switch (eventType) { + case XMLStreamConstants.START_ELEMENT: + Object childElementResult = parseElement(reader); + String childName = reader.getLocalName(); + if (toLower) { + childName = childName.toLowerCase(Locale.ROOT); + } + + // Extract the actual content from the child element result + Object childContent = null; + if (childElementResult instanceof Map) { + @SuppressWarnings("unchecked") + Map childMap = (Map) childElementResult; + // The child element returns {elementName: content}, we want just the content + childContent = childMap.get(childName); + } else { + childContent = childElementResult; + } + + if (element.containsKey(childName) || repeatedElements.containsKey(childName)) { + // Handle repeated elements + if (repeatedElements.containsKey(childName) == false) { + List list = new ArrayList<>(); + list.add(element.get(childName)); + repeatedElements.put(childName, list); + element.remove(childName); + } + repeatedElements.get(childName).add(childContent); + } else { + element.put(childName, childContent); + } + break; + + case XMLStreamConstants.CHARACTERS: + String text = reader.getText(); + if (text != null && text.trim().isEmpty() == false) { + textContent.append(text); + } + break; + + case XMLStreamConstants.END_ELEMENT: + // Add repeated elements as arrays + for (Map.Entry> entry : repeatedElements.entrySet()) { + element.put(entry.getKey(), entry.getValue()); + } + + // Determine what to return + String trimmedText = textContent.toString().trim(); + boolean hasText = trimmedText.isEmpty() == false; + boolean hasChildren = element.size() > attributeCount; // Children beyond attributes + + Map result = new HashMap<>(); + if (hasText == false && hasChildren == false && hasAttributes == false) { + // Empty element + result.put(elementName, null); + return result; + } else if (hasText && hasChildren == false) { + // Only text content (and possibly attributes) + if (hasAttributes) { + element.put("#text", trimmedText); + result.put(elementName, element); + return result; + } else { + result.put(elementName, trimmedText); + return result; + } + } else if (hasText == false && hasChildren) { + // Only child elements (and possibly attributes) + result.put(elementName, element); + return result; + } else { + // Both text and children (and possibly attributes) + element.put("#text", trimmedText); + result.put(elementName, element); + return result; + } + } + } + + return null; + } + + public static final class Factory implements Processor.Factory { + + @Override + public XmlProcessor create( + Map registry, + String processorTag, + String description, + Map config, + ProjectId projectId + ) throws Exception { + String field = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); + String targetField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "target_field", field); + boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); + boolean ignoreFailure = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_failure", false); + boolean toLower = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "to_lower", false); + boolean ignoreEmptyValue = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_empty_value", false); + + return new XmlProcessor(processorTag, description, field, targetField, ignoreMissing, ignoreFailure, toLower, ignoreEmptyValue); + } + } +} diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java new file mode 100644 index 0000000000000..ba1ba839a9edc --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -0,0 +1,82 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.test.ESTestCase; + +import java.util.HashMap; +import java.util.Map; + +import static org.hamcrest.Matchers.equalTo; + +public class XmlProcessorFactoryTests extends ESTestCase { + + public void testCreate() throws Exception { + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + Map config = new HashMap<>(); + config.put("field", "field1"); + config.put("target_field", "target"); + config.put("ignore_missing", true); + config.put("ignore_failure", true); + config.put("to_lower", true); + config.put("ignore_empty_value", true); + + String processorTag = randomAlphaOfLength(10); + XmlProcessor processor = factory.create(null, processorTag, null, config, null); + + assertThat(processor.getTag(), equalTo(processorTag)); + assertThat(processor.getField(), equalTo("field1")); + assertThat(processor.getTargetField(), equalTo("target")); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + } + + public void testCreateWithDefaults() throws Exception { + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + Map config = new HashMap<>(); + config.put("field", "field1"); + + String processorTag = randomAlphaOfLength(10); + XmlProcessor processor = factory.create(null, processorTag, null, config, null); + + assertThat(processor.getTag(), equalTo(processorTag)); + assertThat(processor.getField(), equalTo("field1")); + assertThat(processor.getTargetField(), equalTo("field1")); + assertThat(processor.isIgnoreMissing(), equalTo(false)); + assertThat(processor.isIgnoreEmptyValue(), equalTo(false)); + } + + public void testCreateMissingField() throws Exception { + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + Map config = new HashMap<>(); + + String processorTag = randomAlphaOfLength(10); + ElasticsearchParseException exception = expectThrows( + ElasticsearchParseException.class, + () -> factory.create(null, processorTag, null, config, null) + ); + assertThat(exception.getMessage(), equalTo("[field] required property is missing")); + } + + public void testCreateWithIgnoreEmptyValueOnly() throws Exception { + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + Map config = new HashMap<>(); + config.put("field", "field1"); + config.put("ignore_empty_value", true); + + String processorTag = randomAlphaOfLength(10); + XmlProcessor processor = factory.create(null, processorTag, null, config, null); + + assertThat(processor.getField(), equalTo("field1")); + assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + assertThat(processor.isIgnoreMissing(), equalTo(false)); // other flags should remain default + } +} diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java new file mode 100644 index 0000000000000..0b1ee4123f128 --- /dev/null +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -0,0 +1,626 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.ingest.common; + +import org.elasticsearch.ingest.IngestDocument; +import org.elasticsearch.ingest.RandomDocumentPicks; +import org.elasticsearch.test.ESTestCase; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; + +public class XmlProcessorTests extends ESTestCase { + + public void testSimpleXmlDecodeWithTargetField() throws Exception { + String xml = """ + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "xml", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map xmlField = (Map) ingestDocument.getFieldValue("xml", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) xmlField.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + + assertThat(book.get("seq"), equalTo("1")); + assertThat(book.get("author"), equalTo("William H. Gaddis")); + assertThat(book.get("title"), equalTo("The Recognitions")); + assertThat(book.get("review"), equalTo("One of the great seminal American novels of the 20th century.")); + + // Original field should remain unchanged + assertThat(ingestDocument.getFieldValue("message", String.class), equalTo(xml)); + } + + public void testXmlDecodeToSameFieldWhenTargetIsField() throws Exception { + String xml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + + assertThat(book.get("seq"), equalTo("1")); + assertThat(book.get("author"), equalTo("William H. Gaddis")); + assertThat(book.get("title"), equalTo("The Recognitions")); + assertThat(book.get("review"), equalTo("One of the great seminal American novels of the 20th century.")); + } + + public void testXmlDecodeWithArray() throws Exception { + String xml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + + Ralls, Kim + Midnight Rain + Some review. + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + List> books = (List>) catalog.get("book"); + + assertThat(books.size(), equalTo(2)); + + Map firstBook = books.get(0); + assertThat(firstBook.get("author"), equalTo("William H. Gaddis")); + assertThat(firstBook.get("title"), equalTo("The Recognitions")); + + Map secondBook = books.get(1); + assertThat(secondBook.get("author"), equalTo("Ralls, Kim")); + assertThat(secondBook.get("title"), equalTo("Midnight Rain")); + } + + public void testXmlDecodeWithToLower() throws Exception { + String xml = """ + + + + N/A + + + N/A + + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, true, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map auditbase = (Map) messageField.get("auditbase"); + @SuppressWarnings("unchecked") + Map contextcomponents = (Map) auditbase.get("contextcomponents"); + @SuppressWarnings("unchecked") + List> components = (List>) contextcomponents.get("component"); + + assertThat(components.size(), equalTo(2)); + assertThat(components.get(0).get("relyingparty"), equalTo("N/A")); + assertThat(components.get(1).get("primaryauth"), equalTo("N/A")); + } + + public void testXmlDecodeWithMultipleElements() throws Exception { + String xml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + + Ralls, Kim + Midnight Rain + Some review. + + + + Ralls, Kim + A former architect battles corporate zombies, + an evil sorceress, and her own childhood to become queen of the world. + + + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + + // Check books array + @SuppressWarnings("unchecked") + List> books = (List>) catalog.get("book"); + assertThat(books.size(), equalTo(2)); + + // Check secondcategory + @SuppressWarnings("unchecked") + Map secondcategory = (Map) catalog.get("secondcategory"); + @SuppressWarnings("unchecked") + Map paper = (Map) secondcategory.get("paper"); + assertThat(paper.get("id"), equalTo("bk102")); + assertThat(paper.get("test2"), equalTo("Ralls, Kim")); + } + + public void testXmlDecodeWithUtf16Encoding() throws Exception { + String xml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + + assertThat(book.get("author"), equalTo("William H. Gaddis")); + assertThat(book.get("title"), equalTo("The Recognitions")); + } + + public void testBrokenXmlWithIgnoreFailureFalse() { + String brokenXml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + catalog>"""; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", brokenXml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + assertThat(exception.getMessage(), containsString("contains invalid XML")); + } + + public void testBrokenXmlWithIgnoreFailureTrue() throws Exception { + String brokenXml = """ + + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + catalog>"""; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, true, false, false); + Map document = new HashMap<>(); + document.put("message", brokenXml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + // Should not throw exception and leave document unchanged + processor.execute(ingestDocument); + assertThat(ingestDocument.getFieldValue("message", String.class), equalTo(brokenXml)); + } + + public void testFieldNotFound() { + XmlProcessor processor = new XmlProcessor("tag", null, "nonexistent", "target", false, false, false, false); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); + + Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + assertThat(exception.getMessage(), containsString("not present as part of path [nonexistent]")); + } + + public void testFieldNotFoundWithIgnoreMissing() throws Exception { + XmlProcessor processor = new XmlProcessor("tag", null, "nonexistent", "target", true, false, false, false); + IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); + IngestDocument ingestDocument = new IngestDocument(originalDocument); + + processor.execute(ingestDocument); + + // Document should remain unchanged + assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + } + + public void testNullValue() { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", null); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + assertThat(exception.getMessage(), containsString("field [field] is null")); + } + + public void testNullValueWithIgnoreMissing() throws Exception { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, false, false, false); + Map document = new HashMap<>(); + document.put("field", null); + IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + IngestDocument ingestDocument = new IngestDocument(originalDocument); + + processor.execute(ingestDocument); + + // Document should remain unchanged + assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + } + + public void testNonStringValueWithIgnoreFailureFalse() { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", 123); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); + assertThat(exception.getMessage(), containsString("field [field] is not a string")); + } + + public void testNonStringValueWithIgnoreFailureTrue() throws Exception { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, true, false, false); + Map document = new HashMap<>(); + document.put("field", 123); + IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + IngestDocument ingestDocument = new IngestDocument(originalDocument); + + processor.execute(ingestDocument); + + // Document should remain unchanged + assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + } + + public void testEmptyXml() throws Exception { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", ""); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + // Empty XML should result in null target + assertThat(ingestDocument.getFieldValue("target", Object.class), equalTo(null)); + } + + public void testWhitespaceOnlyXml() throws Exception { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", " \n\t "); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + // Whitespace-only XML should result in null target + assertThat(ingestDocument.getFieldValue("target", Object.class), equalTo(null)); + } + + public void testSelfClosingTag() throws Exception { + String xml = ""; + + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map result = (Map) ingestDocument.getFieldValue("target", Object.class); + assertThat(result.get("empty"), equalTo(null)); + } + + public void testSelfClosingTagWithAttributes() throws Exception { + String xml = ""; + + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map result = (Map) ingestDocument.getFieldValue("target", Object.class); + @SuppressWarnings("unchecked") + Map empty = (Map) result.get("empty"); + assertThat(empty.get("id"), equalTo("123")); + assertThat(empty.get("name"), equalTo("test")); + } + + public void testGetType() { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + assertThat(processor.getType(), equalTo("xml")); + } + + public void testGetters() { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, true, true, false); + assertThat(processor.getField(), equalTo("field")); + assertThat(processor.getTargetField(), equalTo("target")); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + } + + public void testIgnoreEmptyValueEnabled() throws Exception { + String xml = """ + + + William H. Gaddis + + One of the great seminal American novels. + + + + Some content + + + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, true); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + + // Empty title should be filtered out + assertThat(book.containsKey("title"), equalTo(false)); + // Empty element should be filtered out + assertThat(book.containsKey("empty"), equalTo(false)); + // empty_book should be filtered out entirely + assertThat(catalog.containsKey("empty_book"), equalTo(false)); + + // Valid content should remain + assertThat(book.get("author"), equalTo("William H. Gaddis")); + assertThat(book.get("review"), equalTo("One of the great seminal American novels.")); + + // Nested structure handling + @SuppressWarnings("unchecked") + Map nested = (Map) book.get("nested"); + assertThat(nested.containsKey("empty_text"), equalTo(false)); // Whitespace-only should be filtered + assertThat(nested.get("valid_content"), equalTo("Some content")); + } + + public void testIgnoreEmptyValueWithArrays() throws Exception { + String xml = """ + + + Valid Book + + + + + + Another Valid Book + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, true); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + List> books = (List>) catalog.get("book"); + + // Should have 2 books after filtering out the one with empty title + assertThat(books.size(), equalTo(2)); + assertThat(books.get(0).get("title"), equalTo("Valid Book")); + assertThat(books.get(1).get("title"), equalTo("Another Valid Book")); + } + + public void testIgnoreEmptyValueDisabled() throws Exception { + String xml = """ + + + William H. Gaddis + + + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); + Map document = new HashMap<>(); + document.put("message", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) messageField.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + + // Empty values should remain when ignore_empty_value is false + assertThat(book.containsKey("title"), equalTo(true)); + assertThat(book.get("title"), equalTo(null)); // Empty elements are parsed as null + assertThat(book.containsKey("empty"), equalTo(true)); + assertThat(book.get("empty"), equalTo(null)); + assertThat(book.get("author"), equalTo("William H. Gaddis")); + } + + public void testGettersWithIgnoreEmptyValue() { + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, true, true, true); + assertThat(processor.getField(), equalTo("field")); + assertThat(processor.getTargetField(), equalTo("target")); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + } + + public void testElementsWithAttributesAndTextContent() throws Exception { + String xml = """ + + + The Recognitions + William H. Gaddis + 29.99 + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); + Map document = new HashMap<>(); + document.put("field", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map result = (Map) ingestDocument.getFieldValue("target", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) result.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + @SuppressWarnings("unchecked") + Map title = (Map) book.get("title"); + @SuppressWarnings("unchecked") + Map author = (Map) book.get("author"); + @SuppressWarnings("unchecked") + Map price = (Map) book.get("price"); + + // Test catalog with attributes only + assertThat(catalog.get("version"), equalTo("1.0")); + + // Test book with attributes only (no text content) + assertThat(book.get("id"), equalTo("123")); + assertThat(book.get("isbn"), equalTo("978-0-684-80335-9")); + + // Test elements with both attributes and text content (should use #text key) + assertThat(title.get("lang"), equalTo("en")); + assertThat(title.get("#text"), equalTo("The Recognitions")); + + assertThat(author.get("nationality"), equalTo("American")); + assertThat(author.get("#text"), equalTo("William H. Gaddis")); + + assertThat(price.get("currency"), equalTo("USD")); + assertThat(price.get("#text"), equalTo("29.99")); + } + + public void testMixedAttributesAndTextWithToLower() throws Exception { + String xml = """ + + + The Recognitions + William H. Gaddis + + """; + + XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, true, false); + Map document = new HashMap<>(); + document.put("field", xml); + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + processor.execute(ingestDocument); + + @SuppressWarnings("unchecked") + Map result = (Map) ingestDocument.getFieldValue("target", Object.class); + @SuppressWarnings("unchecked") + Map catalog = (Map) result.get("catalog"); + @SuppressWarnings("unchecked") + Map book = (Map) catalog.get("book"); + @SuppressWarnings("unchecked") + Map title = (Map) book.get("title"); + @SuppressWarnings("unchecked") + Map author = (Map) book.get("author"); + + // Test that element names are converted to lowercase + assertThat(catalog.get("version"), equalTo("1.0")); + assertThat(book.get("id"), equalTo("123")); + + // Test that attribute names are converted to lowercase but values remain unchanged + assertThat(title.get("lang"), equalTo("EN")); + assertThat(title.get("#text"), equalTo("The Recognitions")); + + assertThat(author.get("nationality"), equalTo("AMERICAN")); + assertThat(author.get("#text"), equalTo("William H. Gaddis")); + } +} From 16e129ec0f2e300476423fba43dab895b97ed0da Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 30 Jun 2025 14:37:21 +0000 Subject: [PATCH 2/9] [CI] Auto commit changes from spotless --- modules/ingest-common/src/main/java/module-info.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ingest-common/src/main/java/module-info.java b/modules/ingest-common/src/main/java/module-info.java index ee98b7515e733..84e30519d2d1b 100644 --- a/modules/ingest-common/src/main/java/module-info.java +++ b/modules/ingest-common/src/main/java/module-info.java @@ -19,7 +19,7 @@ requires org.apache.logging.log4j; requires org.apache.lucene.analysis.common; requires org.jruby.joni; - + requires java.xml; exports org.elasticsearch.ingest.common; // for painless From 12f45606c2f0a86b646434037b9c13386ca66a31 Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Mon, 30 Jun 2025 18:04:49 +0200 Subject: [PATCH 3/9] Make factory static --- .../ingest/common/XmlProcessor.java | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 897d36d7084b7..f3f1cc6d0a6a8 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -34,6 +34,8 @@ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; + + private static final XMLInputFactory XML_INPUT_FACTORY = createXmlInputFactory(); private final String field; private final String targetField; @@ -157,6 +159,19 @@ private Object filterEmptyValues(Object obj) { return isEmptyValue(obj) ? null : obj; } + /** + * Creates and configures a secure XMLInputFactory for XML parsing. + * This factory is configured to prevent XXE attacks and optimize parsing. + */ + private static XMLInputFactory createXmlInputFactory() { + XMLInputFactory factory = XMLInputFactory.newInstance(); + factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); + factory.setProperty(XMLInputFactory.IS_COALESCING, true); + factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); + factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); + return factory; + } + /** * Determines if a value should be considered empty for filtering purposes. */ @@ -181,14 +196,8 @@ private Object parseXml(String xmlString) throws XMLStreamException { return null; } - XMLInputFactory factory = XMLInputFactory.newInstance(); - factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); - factory.setProperty(XMLInputFactory.IS_COALESCING, true); - factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); - factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); - try (StringReader reader = new StringReader(xmlString)) { - XMLStreamReader xmlReader = factory.createXMLStreamReader(reader); + XMLStreamReader xmlReader = XML_INPUT_FACTORY.createXMLStreamReader(reader); // Skip to the first element while (xmlReader.hasNext() && xmlReader.getEventType() != XMLStreamConstants.START_ELEMENT) { From 0a7205960a8838fc2f0a02b30146f2d5b99e5ad0 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 30 Jun 2025 16:23:35 +0000 Subject: [PATCH 4/9] [CI] Auto commit changes from spotless --- .../main/java/org/elasticsearch/ingest/common/XmlProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index f3f1cc6d0a6a8..8122e88f458aa 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -34,7 +34,7 @@ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; - + private static final XMLInputFactory XML_INPUT_FACTORY = createXmlInputFactory(); private final String field; From 3a5689e472523820f5b3af5deb31f135a799a759 Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Tue, 1 Jul 2025 10:34:15 +0200 Subject: [PATCH 5/9] Update docs/changelog/130337.yaml --- docs/changelog/130337.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/changelog/130337.yaml diff --git a/docs/changelog/130337.yaml b/docs/changelog/130337.yaml new file mode 100644 index 0000000000000..2ea20ebd1944e --- /dev/null +++ b/docs/changelog/130337.yaml @@ -0,0 +1,6 @@ +pr: 130337 +summary: Add `XmlProcessor` initial implementation +area: Ingest Node +type: enhancement +issues: + - 97364 From 0ec3e67e81d1bd0e5285eec835fee4b8041e23eb Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Fri, 4 Jul 2025 15:56:28 +0200 Subject: [PATCH 6/9] feat: rewrite XML processor for Logstash feature parity - Replace XMLStreamReader with SAX parser + DOM for XPath support - Add XPath extraction, namespaces, strict parsing, content filtering - New options: force_array, force_content, remove_namespaces, store_xml - Enhanced security with XXE protection and pre-compiled XPath expressions - Full test coverage and updated documentation --- .../enrich-processor/xml-processor.md | 352 +++++- .../ingest/common/XmlProcessor.java | 958 ++++++++++++--- .../common/XmlProcessorFactoryTests.java | 376 +++++- .../ingest/common/XmlProcessorTests.java | 1094 +++++++++-------- 4 files changed, 1990 insertions(+), 790 deletions(-) diff --git a/docs/reference/enrich-processor/xml-processor.md b/docs/reference/enrich-processor/xml-processor.md index 0621a66d8edc6..7452fea12ed65 100644 --- a/docs/reference/enrich-processor/xml-processor.md +++ b/docs/reference/enrich-processor/xml-processor.md @@ -6,8 +6,7 @@ mapped_pages: # XML processor [xml-processor] - -Parses XML documents and converts them to JSON objects using a streaming XML parser. This processor efficiently handles XML data by avoiding loading the entire document into memory. +Parses XML documents and converts them to JSON objects using a DOM parser. This processor efficiently handles XML data with a single-parse architecture that supports both structured output and XPath extraction for optimal performance. $$$xml-options$$$ @@ -15,10 +14,17 @@ $$$xml-options$$$ | --- | --- | --- | --- | | `field` | yes | - | The field containing the XML string to be parsed. | | `target_field` | no | `field` | The field that the converted structured object will be written into. Any existing content in this field will be overwritten. | +| `store_xml` | no | `true` | If `true`, stores the parsed XML structure in the target field. If `false`, only XPath extraction results are stored. | | `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document. | -| `ignore_failure` | no | `false` | Ignore failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | -| `to_lower` | no | `false` | Convert XML element names to lowercase. | +| `ignore_failure` | no | `false` | Ignore failures for the processor. When `true` and XML parsing fails, adds `_xmlparsefailure` tag to the document. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | +| `to_lower` | no | `false` | Convert XML element names and attribute names to lowercase. | | `ignore_empty_value` | no | `false` | If `true`, the processor will filter out null and empty values from the parsed XML structure, including empty elements, elements with null values, and elements with whitespace-only content. | +| `remove_namespaces` | no | `false` | If `true`, removes namespace prefixes from element and attribute names. | +| `force_content` | no | `false` | If `true`, forces text content and attributes to always parse to a hash value with `#text` key for content. | +| `force_array` | no | `false` | If `true`, forces all parsed values to be arrays. Single elements are wrapped in arrays. | +| `parse_options` | no | - | Controls XML parsing behavior. Set to `"strict"` for strict XML validation that fails fast on invalid content. | +| `xpath` | no | - | Map of XPath expressions to target field names. Extracts values from the XML using XPath and stores them in the specified fields. | +| `namespaces` | no | - | Map of namespace prefixes to URIs for use with XPath expressions. Required when XPath expressions contain namespace prefixes. | | `description` | no | - | Description of the processor. Useful for describing the purpose of the processor or its configuration. | | `if` | no | - | Conditionally execute the processor. See [Conditionally run a processor](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#conditionally-run-processor). | | `on_failure` | no | - | Handle failures for the processor. See [Handling pipeline failures](docs-content://manage-data/ingest/transform-enrich/ingest-pipelines.md#handling-pipeline-failures). | @@ -69,9 +75,7 @@ Result: "docs": [ { "doc": { - "_index": "_index", - "_id": "_id", - "_version": "-3", + ... "_source": { "xml_content": "William H. GaddisThe RecognitionsOne of the great seminal American novels.", "catalog": { @@ -81,9 +85,6 @@ Result: "review": "One of the great seminal American novels." } } - }, - "_ingest": { - "timestamp": "2019-03-11T21:54:37.909224Z" } } } @@ -126,9 +127,7 @@ Result with empty elements filtered out: "docs": [ { "doc": { - "_index": "_index", - "_id": "_id", - "_version": "-3", + ... "_source": { "xml_content": "William H. GaddisOne of the great seminal American novels. Some content", "parsed_xml": { @@ -142,9 +141,6 @@ Result with empty elements filtered out: } } } - }, - "_ingest": { - "timestamp": "2019-03-11T21:54:37.909224Z" } } } @@ -184,9 +180,7 @@ Result: "docs": [ { "doc": { - "_index": "_index", - "_id": "_id", - "_version": "-3", + ... "_source": { "xml_content": "William H. GaddisThe Recognitions", "catalog": { @@ -195,9 +189,6 @@ Result: "title": "The Recognitions" } } - }, - "_ingest": { - "timestamp": "2019-03-11T21:54:37.909224Z" } } } @@ -238,9 +229,7 @@ Result: "docs": [ { "doc": { - "_index": "_index", - "_id": "_id", - "_version": "-3", + ... "_source": { "xml_content": "The RecognitionsWilliam H. Gaddis", "catalog": { @@ -258,9 +247,314 @@ Result: } } } - }, - "_ingest": { - "timestamp": "2019-03-11T21:54:37.909224Z" + } + } + } + ] +} +``` + +### XPath extraction + +The XML processor can extract specific values using XPath expressions and store them in designated fields: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "store_xml": false, + "xpath": { + "//book/title/text()": "book_title", + "//book/author/text()": "book_author", + "//book/@id": "book_id" + } + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis1984George Orwell" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The RecognitionsWilliam H. Gaddis1984George Orwell", + "book_title": ["The Recognitions", "1984"], + "book_author": ["William H. Gaddis", "George Orwell"], + "book_id": ["123", "456"] + } + } + } + ] +} +``` + +### XPath with namespaces + +When working with XML that uses namespaces, you need to configure namespace mappings: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "namespaces": { + "book": "http://example.com/book", + "author": "http://example.com/author" + }, + "xpath": { + "//book:catalog/book:item/book:title/text()": "titles", + "//author:info/@name": "author_names" + } + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The Recognitions", + "titles": "The Recognitions", + "author_names": "William H. Gaddis", + "book:catalog": { + "book:item": { + "book:title": "The Recognitions", + "author:info": { + "name": "William H. Gaddis" + } + } + } + } + } + } + ] +} +``` + +### Force array behavior + +When `force_array` is true, all parsed values become arrays: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "force_array": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The Recognitions", + "catalog": [ + { + "book": [ + { + "title": ["The Recognitions"] + } + ] + } + ] + } + } + } + ] +} +``` + +### Strict parsing mode + +Use `parse_options: "strict"` for strict XML validation: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "parse_options": "strict", + "ignore_failure": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "Invalid XML with control character" + } + } + ] +} +``` + +Result (with parsing failure): + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "Invalid XML with control character", + "tags": ["_xmlparsefailure"] + } + } + } + ] +} +``` + +### Mixed content handling + +When XML contains mixed content (text interspersed with elements), text fragments are combined and stored under the special `#text` key: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content" + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "This text is bold and this is italic!" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "This text is bold and this is italic!", + "foo": { + "b": "bold", + "i": "italic", + "#text": "This text is and this is !" + } + } + } + } + ] +} +``` + +### Force content mode + +When `force_content` is `true`, all element text content is stored under the special `#text` key: + +```console +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "processors": [ + { + "xml": { + "field": "xml_content", + "force_content": true + } + } + ] + }, + "docs": [ + { + "_source": { + "xml_content": "The Recognitions" + } + } + ] +} +``` + +Result: + +```console-result +{ + "docs": [ + { + "doc": { + ... + "_source": { + "xml_content": "The Recognitions", + "book": { + "author": "William H. Gaddis", + "#text": "The Recognitions" + } } } } @@ -278,4 +572,4 @@ The XML processor supports: - **Repeated elements**: Converted to arrays when multiple elements with the same name exist at the same level - **XML attributes**: Included as properties in the JSON object alongside element content. When an element has both attributes and text content, the text is stored under a special `#text` key - **Mixed content**: Elements with both text and child elements include text under a special `#text` key while attributes and child elements become object properties -- **Namespaces**: Local names are used, namespace prefixes are ignored +- **Namespaces**: Namespace prefixes are preserved by default and can be used in XPath expressions with the `namespaces` configuration. Use `remove_namespaces: true` to strip namespace prefixes from element names diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 8122e88f458aa..582b8d9f3e787 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -15,27 +15,51 @@ import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; -import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; -import javax.xml.stream.XMLInputFactory; -import javax.xml.stream.XMLStreamConstants; -import javax.xml.stream.XMLStreamException; -import javax.xml.stream.XMLStreamReader; +import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; + +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; /** - * Processor that parses XML documents and converts them to JSON objects using streaming XML parser. - * This implementation uses XMLStreamReader for efficient parsing and avoids loading the entire document in memory. + * Processor that parses XML documents and converts them to JSON objects using a single-pass streaming approach. + * + * Features: + * - XML to JSON conversion with configurable structure options + * - XPath extraction with namespace support + * - Configurable options: force_array, force_content, remove_namespaces, to_lower + * - Strict parsing mode for XML validation + * - Empty value filtering with ignore_empty_value option + * - Logstash-compatible error handling and behavior */ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; - - private static final XMLInputFactory XML_INPUT_FACTORY = createXmlInputFactory(); + + private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); + + // Pre-configured SAX parser factories for secure XML parsing + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = createSecureSaxParserFactoryNamespaceAwareStrict(); + + // Pre-configured document builder factory for DOM creation + private static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); private final String field; private final String targetField; @@ -43,6 +67,14 @@ public final class XmlProcessor extends AbstractProcessor { private final boolean ignoreFailure; private final boolean toLower; private final boolean ignoreEmptyValue; + private final boolean storeXml; + private final boolean removeNamespaces; + private final boolean forceContent; + private final boolean forceArray; + private final Map xpathExpressions; + private final Map namespaces; + private final Map compiledXPathExpressions; + private final String parseOptions; XmlProcessor( String tag, @@ -52,7 +84,14 @@ public final class XmlProcessor extends AbstractProcessor { boolean ignoreMissing, boolean ignoreFailure, boolean toLower, - boolean ignoreEmptyValue + boolean ignoreEmptyValue, + boolean storeXml, + boolean removeNamespaces, + boolean forceContent, + boolean forceArray, + Map xpathExpressions, + Map namespaces, + String parseOptions ) { super(tag, description); this.field = field; @@ -61,6 +100,14 @@ public final class XmlProcessor extends AbstractProcessor { this.ignoreFailure = ignoreFailure; this.toLower = toLower; this.ignoreEmptyValue = ignoreEmptyValue; + this.storeXml = storeXml; + this.removeNamespaces = removeNamespaces; + this.forceContent = forceContent; + this.forceArray = forceArray; + this.xpathExpressions = xpathExpressions != null ? Map.copyOf(xpathExpressions) : Map.of(); + this.namespaces = namespaces != null ? Map.copyOf(namespaces) : Map.of(); + this.compiledXPathExpressions = compileXPathExpressions(this.xpathExpressions, this.namespaces); + this.parseOptions = parseOptions != null ? parseOptions : ""; } public String getField() { @@ -79,13 +126,46 @@ public boolean isIgnoreEmptyValue() { return ignoreEmptyValue; } + public boolean isStoreXml() { + return storeXml; + } + + public boolean isRemoveNamespaces() { + return removeNamespaces; + } + + public boolean isForceContent() { + return forceContent; + } + + public boolean isStrict() { + return "strict".equals(parseOptions); + } + + public boolean isForceArray() { + return forceArray; + } + + public boolean hasNamespaces() { + return namespaces.isEmpty() == false; + } + + public Map getNamespaces() { + return namespaces; + } + + public String getParseOptions() { + return parseOptions; + } + @Override public IngestDocument execute(IngestDocument document) { Object fieldValue = document.getFieldValue(field, Object.class, ignoreMissing); - if (fieldValue == null && ignoreMissing) { - return document; - } else if (fieldValue == null) { + if (fieldValue == null) { + if (ignoreMissing || ignoreFailure) { + return document; + } throw new IllegalArgumentException("field [" + field + "] is null, cannot parse XML"); } @@ -98,13 +178,14 @@ public IngestDocument execute(IngestDocument document) { String xmlString = (String) fieldValue; try { - Object parsedXml = parseXml(xmlString.trim()); - if (ignoreEmptyValue) { - parsedXml = filterEmptyValues(parsedXml); + // Always use streaming parser for optimal performance and memory usage + if (storeXml || xpathExpressions.isEmpty() == false) { + parseXmlAndXPath(document, xmlString.trim()); } - document.setFieldValue(targetField, parsedXml); } catch (Exception e) { if (ignoreFailure) { + // Add failure tag similar to Logstash behavior + document.appendFieldValue("tags", "_xmlparsefailure"); return document; } throw new IllegalArgumentException("field [" + field + "] contains invalid XML: " + e.getMessage(), e); @@ -118,62 +199,18 @@ public String getType() { return TYPE; } - /** - * Recursively removes null and empty values from the parsed XML structure - * when ignoreEmptyValue is enabled. - */ - @SuppressWarnings("unchecked") - private Object filterEmptyValues(Object obj) { - if (obj == null) { - return null; - } - - if (obj instanceof Map) { - Map map = (Map) obj; - Map filtered = new HashMap<>(); - - for (Map.Entry entry : map.entrySet()) { - Object filteredValue = filterEmptyValues(entry.getValue()); - if (filteredValue != null && isEmptyValue(filteredValue) == false) { - filtered.put(entry.getKey(), filteredValue); - } - } - - return filtered.isEmpty() ? null : filtered; - } - - if (obj instanceof List) { - List list = (List) obj; - List filtered = new ArrayList<>(); - - for (Object item : list) { - Object filteredItem = filterEmptyValues(item); - if (filteredItem != null && isEmptyValue(filteredItem) == false) { - filtered.add(filteredItem); - } - } - - return filtered.isEmpty() ? null : filtered; - } - - return isEmptyValue(obj) ? null : obj; - } - - /** - * Creates and configures a secure XMLInputFactory for XML parsing. - * This factory is configured to prevent XXE attacks and optimize parsing. - */ - private static XMLInputFactory createXmlInputFactory() { - XMLInputFactory factory = XMLInputFactory.newInstance(); - factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); - factory.setProperty(XMLInputFactory.IS_COALESCING, true); - factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); - factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); - return factory; - } - /** * Determines if a value should be considered empty for filtering purposes. + * Used by the ignore_empty_value feature to filter out empty content. + * + * Considers empty: + * - null values + * - empty or whitespace-only strings + * - empty Maps + * - empty Lists + * + * @param value the value to check + * @return true if the value should be considered empty */ private boolean isEmptyValue(Object value) { if (value == null) { @@ -191,139 +228,187 @@ private boolean isEmptyValue(Object value) { return false; } - private Object parseXml(String xmlString) throws XMLStreamException { - if (xmlString == null || xmlString.trim().isEmpty()) { + /** + * Extract the text value from a DOM node for XPath result processing. + * Handles different node types appropriately: + * - TEXT_NODE and CDATA_SECTION_NODE: returns node value directly + * - ATTRIBUTE_NODE: returns attribute value + * - ELEMENT_NODE: returns text content (concatenated text of all descendants) + * - Other node types: returns text content as fallback + * + * @param node the DOM node to extract text from + * @return the text content of the node, or null if node is null + */ + private String getNodeValue(Node node) { + if (node == null) { return null; } - - try (StringReader reader = new StringReader(xmlString)) { - XMLStreamReader xmlReader = XML_INPUT_FACTORY.createXMLStreamReader(reader); - - // Skip to the first element - while (xmlReader.hasNext() && xmlReader.getEventType() != XMLStreamConstants.START_ELEMENT) { - xmlReader.next(); - } - - if (xmlReader.hasNext() == false) { - return null; - } - - Object result = parseElement(xmlReader); - xmlReader.close(); - return result; + + switch (node.getNodeType()) { + case Node.ATTRIBUTE_NODE: + case Node.CDATA_SECTION_NODE: + case Node.TEXT_NODE: + return node.getNodeValue(); + case Node.ELEMENT_NODE: + default: + return node.getTextContent(); } } - private Object parseElement(XMLStreamReader reader) throws XMLStreamException { - if (reader.getEventType() != XMLStreamConstants.START_ELEMENT) { - return null; - } - - String elementName = reader.getLocalName(); - if (toLower) { - elementName = elementName.toLowerCase(Locale.ROOT); - } - - Map element = new HashMap<>(); - Map> repeatedElements = new HashMap<>(); - - // Parse attributes - they are available in START_ELEMENT state - int attributeCount = reader.getAttributeCount(); - boolean hasAttributes = attributeCount > 0; - for (int i = 0; i < attributeCount; i++) { - String attrName = reader.getAttributeLocalName(i); - String attrValue = reader.getAttributeValue(i); - if (toLower) { - attrName = attrName.toLowerCase(Locale.ROOT); - } - element.put(attrName, attrValue); + /** + * Applies force_array logic to ensure all fields are arrays when enabled. + * + * Behavior: + * - If force_array is false: returns content unchanged + * - If force_array is true and content is already a List: returns content unchanged + * - If force_array is true and content is not a List: wraps content in a new ArrayList + * - Handles null content appropriately (wraps null in array if force_array is true) + * + * @param elementName the name of the element (for context, not used in current implementation) + * @param content the content to potentially wrap in an array + * @return the content, optionally wrapped in an array based on force_array setting + */ + private Object applyForceArray(String elementName, Object content) { + if (forceArray && !(content instanceof List)) { + List arrayContent = new ArrayList<>(); + arrayContent.add(content); // Add content even if it's null (for empty elements) + return arrayContent; } + return content; + } - StringBuilder textContent = new StringBuilder(); - - while (reader.hasNext()) { - int eventType = reader.next(); - - switch (eventType) { - case XMLStreamConstants.START_ELEMENT: - Object childElementResult = parseElement(reader); - String childName = reader.getLocalName(); - if (toLower) { - childName = childName.toLowerCase(Locale.ROOT); - } - - // Extract the actual content from the child element result - Object childContent = null; - if (childElementResult instanceof Map) { - @SuppressWarnings("unchecked") - Map childMap = (Map) childElementResult; - // The child element returns {elementName: content}, we want just the content - childContent = childMap.get(childName); - } else { - childContent = childElementResult; + /** + * Evaluates precompiled XPath expressions against a DOM document and adds results to the ingest document. + * + * Features: + * - Uses precompiled XPath expressions for optimal performance + * - Extracts text values from matched nodes (elements, attributes, text nodes) + * - Single matches stored as strings, multiple matches as string arrays + * - Respects ignoreFailure setting for XPath evaluation errors + * + * @param document the ingest document to add XPath results to + * @param doc the DOM document to evaluate XPath expressions against + * @throws Exception if XPath processing fails and ignoreFailure is false + */ + private void processXPathExpressionsFromDom(IngestDocument document, Document doc) throws Exception { + // Use precompiled XPath expressions for optimal performance + for (Map.Entry entry : compiledXPathExpressions.entrySet()) { + String targetFieldName = entry.getKey(); + XPathExpression compiledExpression = entry.getValue(); + + try { + Object result = compiledExpression.evaluate(doc, XPathConstants.NODESET); + + if (result instanceof NodeList) { + NodeList nodeList = (NodeList) result; + List values = new ArrayList<>(); + + for (int i = 0; i < nodeList.getLength(); i++) { + Node node = nodeList.item(i); + String value = getNodeValue(node); + if (value != null && value.trim().isEmpty() == false) { + values.add(value); + } } - - if (element.containsKey(childName) || repeatedElements.containsKey(childName)) { - // Handle repeated elements - if (repeatedElements.containsKey(childName) == false) { - List list = new ArrayList<>(); - list.add(element.get(childName)); - repeatedElements.put(childName, list); - element.remove(childName); + + if (values.isEmpty() == false) { + if (values.size() == 1) { + document.setFieldValue(targetFieldName, values.get(0)); + } else { + document.setFieldValue(targetFieldName, values); } - repeatedElements.get(childName).add(childContent); - } else { - element.put(childName, childContent); } - break; + } + } catch (XPathExpressionException e) { + if (ignoreFailure == false) { + throw new IllegalArgumentException("XPath evaluation failed for target field [" + targetFieldName + "]: " + e.getMessage(), e); + } + } + } + } - case XMLStreamConstants.CHARACTERS: - String text = reader.getText(); - if (text != null && text.trim().isEmpty() == false) { - textContent.append(text); + /** + * Compiles XPath expressions at processor creation time for optimal runtime performance. + * This method pre-compiles all configured XPath expressions with appropriate namespace context, + * eliminating the compilation overhead during document processing. + * + * @param xpathExpressions map of XPath expressions to target field names + * @param namespaces map of namespace prefixes to URIs + * @return map of compiled XPath expressions keyed by target field name + * @throws IllegalArgumentException if XPath compilation fails or namespace validation fails + */ + private static Map compileXPathExpressions( + Map xpathExpressions, + Map namespaces + ) { + if (xpathExpressions.isEmpty()) { + return Map.of(); + } + + Map compiled = new HashMap<>(); + XPath xpath = XPATH_FACTORY.newXPath(); + + // Set namespace context if namespaces are defined + boolean hasNamespaces = namespaces.isEmpty() == false; + if (hasNamespaces) { + xpath.setNamespaceContext(new NamespaceContext() { + @Override + public String getNamespaceURI(String prefix) { + if (prefix == null) { + throw new IllegalArgumentException("Prefix cannot be null"); } - break; + return namespaces.getOrDefault(prefix, ""); + } - case XMLStreamConstants.END_ELEMENT: - // Add repeated elements as arrays - for (Map.Entry> entry : repeatedElements.entrySet()) { - element.put(entry.getKey(), entry.getValue()); + @Override + public String getPrefix(String namespaceURI) { + for (Map.Entry entry : namespaces.entrySet()) { + if (entry.getValue().equals(namespaceURI)) { + return entry.getKey(); + } } + return null; + } - // Determine what to return - String trimmedText = textContent.toString().trim(); - boolean hasText = trimmedText.isEmpty() == false; - boolean hasChildren = element.size() > attributeCount; // Children beyond attributes - - Map result = new HashMap<>(); - if (hasText == false && hasChildren == false && hasAttributes == false) { - // Empty element - result.put(elementName, null); - return result; - } else if (hasText && hasChildren == false) { - // Only text content (and possibly attributes) - if (hasAttributes) { - element.put("#text", trimmedText); - result.put(elementName, element); - return result; - } else { - result.put(elementName, trimmedText); - return result; + @Override + public Iterator getPrefixes(String namespaceURI) { + List prefixes = new ArrayList<>(); + for (Map.Entry entry : namespaces.entrySet()) { + if (entry.getValue().equals(namespaceURI)) { + prefixes.add(entry.getKey()); } - } else if (hasText == false && hasChildren) { - // Only child elements (and possibly attributes) - result.put(elementName, element); - return result; - } else { - // Both text and children (and possibly attributes) - element.put("#text", trimmedText); - result.put(elementName, element); - return result; } + return prefixes.iterator(); + } + }); + } + + // Pre-compiled pattern to detect namespace prefixes + java.util.regex.Pattern namespacePattern = + java.util.regex.Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); + + for (Map.Entry entry : xpathExpressions.entrySet()) { + String xpathExpression = entry.getKey(); + String targetFieldName = entry.getValue(); + + // Validate namespace prefixes if no namespaces are configured + if (!hasNamespaces && namespacePattern.matcher(xpathExpression).matches()) { + throw new IllegalArgumentException( + "Invalid XPath expression [" + xpathExpression + "]: contains namespace prefixes but no namespace configuration provided" + ); + } + + try { + XPathExpression compiledExpression = xpath.compile(xpathExpression); + compiled.put(targetFieldName, compiledExpression); + } catch (XPathExpressionException e) { + throw new IllegalArgumentException( + "Invalid XPath expression [" + xpathExpression + "]: " + e.getMessage(), e + ); } } - - return null; + + return Map.copyOf(compiled); } public static final class Factory implements Processor.Factory { @@ -342,8 +427,509 @@ public XmlProcessor create( boolean ignoreFailure = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_failure", false); boolean toLower = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "to_lower", false); boolean ignoreEmptyValue = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_empty_value", false); + boolean storeXml = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "store_xml", true); + boolean removeNamespaces = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "remove_namespaces", false); + boolean forceContent = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_content", false); + boolean forceArray = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_array", false); + + // Parse XPath expressions map + Map xpathExpressions = new HashMap<>(); + Object xpathConfig = config.get("xpath"); + if (xpathConfig != null) { + if (xpathConfig instanceof Map) { + @SuppressWarnings("unchecked") + Map xpathMap = (Map) xpathConfig; + for (Map.Entry entry : xpathMap.entrySet()) { + if (entry.getValue() instanceof String) { + xpathExpressions.put(entry.getKey(), (String) entry.getValue()); + } else { + throw new IllegalArgumentException( + "XPath target field [" + entry.getKey() + "] must be a string, got [" + entry.getValue().getClass().getSimpleName() + "]" + ); + } + } + } else { + throw new IllegalArgumentException("XPath configuration must be a map of expressions to target fields"); + } + } + + // Parse namespaces map + Map namespaces = new HashMap<>(); + Object namespaceConfig = config.get("namespaces"); + if (namespaceConfig != null) { + if (namespaceConfig instanceof Map) { + @SuppressWarnings("unchecked") + Map namespaceMap = (Map) namespaceConfig; + for (Map.Entry entry : namespaceMap.entrySet()) { + if (entry.getValue() instanceof String) { + namespaces.put(entry.getKey(), (String) entry.getValue()); + } else { + throw new IllegalArgumentException( + "Namespace prefix [" + entry.getKey() + "] must have a string URI, got [" + entry.getValue().getClass().getSimpleName() + "]" + ); + } + } + } else { + throw new IllegalArgumentException("Namespaces configuration must be a map of prefixes to URIs"); + } + } + + // Parse parse_options parameter + String parseOptions = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "parse_options", ""); + if (parseOptions != null && parseOptions != "" && !"strict".equals(parseOptions)) { + throw new IllegalArgumentException("Invalid parse_options [" + parseOptions + "]. Only 'strict' is supported."); + } - return new XmlProcessor(processorTag, description, field, targetField, ignoreMissing, ignoreFailure, toLower, ignoreEmptyValue); + return new XmlProcessor(processorTag, description, field, targetField, ignoreMissing, ignoreFailure, toLower, ignoreEmptyValue, storeXml, removeNamespaces, forceContent, forceArray, xpathExpressions, namespaces, parseOptions); + } + } + + /** + * Main XML parsing method that converts XML to JSON and optionally extracts XPath values. + * Uses streaming SAX parser with optional DOM building for XPath processing. + * + * @param document the ingest document to modify with parsed results + * @param xmlString the XML string to parse (should be trimmed) + * @throws Exception if XML parsing fails + */ + private void parseXmlAndXPath(IngestDocument document, String xmlString) throws Exception { + if (xmlString == null || xmlString.trim().isEmpty()) { + return; + } + + // Determine if we need DOM for XPath processing + boolean needsDom = xpathExpressions.isEmpty() == false; + + // Use the appropriate pre-configured SAX parser factory + javax.xml.parsers.SAXParserFactory factory = selectSaxParserFactory(); + + javax.xml.parsers.SAXParser parser = factory.newSAXParser(); + + // Configure error handler for strict mode + if (isStrict()) { + parser.getXMLReader().setErrorHandler(new org.xml.sax.ErrorHandler() { + @Override + public void warning(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { + throw exception; + } + + @Override + public void error(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { + throw exception; + } + + @Override + public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { + throw exception; + } + }); + } + + // Use enhanced handler that can build DOM during streaming when needed + XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); + + parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes("UTF-8")), handler); + + // Store structured result if needed + if (storeXml) { + Object streamingResult = handler.getStructuredResult(); + if (streamingResult != null) { + document.setFieldValue(targetField, streamingResult); + } + } + + // Process XPath expressions if DOM was built during streaming + if (needsDom) { + Document domDocument = handler.getDomDocument(); + if (domDocument != null) { + processXPathExpressionsFromDom(document, domDocument); + } + } + } + + /** + * SAX ContentHandler that builds structured JSON output and optionally constructs a DOM tree during parsing. + * Handles XML-to-JSON conversion with support for all processor configuration options. + */ + private class XmlStreamingWithDomHandler extends org.xml.sax.helpers.DefaultHandler { + // Streaming parser state (for structured output) + private final java.util.Deque> elementStack = new java.util.ArrayDeque<>(); + private final java.util.Deque elementNameStack = new java.util.ArrayDeque<>(); + private final java.util.Deque textStack = new java.util.ArrayDeque<>(); + private final java.util.Deque>> repeatedElementsStack = new java.util.ArrayDeque<>(); + private Object rootResult = null; + + // DOM building state (for XPath processing when needed) + private final boolean buildDom; + private Document domDocument = null; + private final java.util.Deque domElementStack = new java.util.ArrayDeque<>(); + + public XmlStreamingWithDomHandler(boolean buildDom) { + this.buildDom = buildDom; + } + + @Override + public void startDocument() throws org.xml.sax.SAXException { + // Initialize DOM document if needed + if (buildDom) { + try { + // Use pre-configured secure DOM factory + // Since we build DOM programmatically (createElementNS/createElement), + // the factory's namespace awareness doesn't affect our usage + DocumentBuilder builder = DOM_FACTORY.newDocumentBuilder(); + domDocument = builder.newDocument(); + } catch (Exception e) { + throw new org.xml.sax.SAXException("Failed to create DOM document", e); + } + } + } + + @Override + public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException { + String elementName = getElementName(uri, localName, qName); + + // Build structured representation (always) + Map element = new HashMap<>(); + Map> repeatedElements = new HashMap<>(); + + // Process attributes for structured output + for (int i = 0; i < attributes.getLength(); i++) { + String attrName = getAttributeName(attributes.getURI(i), attributes.getLocalName(i), attributes.getQName(i)); + String attrValue = attributes.getValue(i); + + // Apply ignoreEmptyValue filtering to attributes + if (ignoreEmptyValue == false || isEmptyValue(attrValue) == false) { + element.put(attrName, attrValue); + } + } + + elementStack.push(element); + elementNameStack.push(elementName); + textStack.push(new StringBuilder()); + repeatedElementsStack.push(repeatedElements); + + // Build DOM element simultaneously if needed + if (buildDom && domDocument != null) { + org.w3c.dom.Element domElement; + if (uri != null && !uri.isEmpty() && !removeNamespaces) { + domElement = domDocument.createElementNS(uri, qName); + } else { + domElement = domDocument.createElement(removeNamespaces ? localName : qName); + } + + // Add attributes to DOM element + for (int i = 0; i < attributes.getLength(); i++) { + String attrUri = attributes.getURI(i); + String attrLocalName = attributes.getLocalName(i); + String attrQName = attributes.getQName(i); + String attrValue = attributes.getValue(i); + + if (attrUri != null && !attrUri.isEmpty() && !removeNamespaces) { + domElement.setAttributeNS(attrUri, attrQName, attrValue); + } else { + domElement.setAttribute(removeNamespaces ? attrLocalName : attrQName, attrValue); + } + } + + // Add to parent or root + if (domElementStack.isEmpty()) { + domDocument.appendChild(domElement); + } else { + domElementStack.peek().appendChild(domElement); + } + + domElementStack.push(domElement); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException { + // Add to structured output text accumulator + if (!textStack.isEmpty()) { + textStack.peek().append(ch, start, length); + } + + // Add to DOM text node if needed + if (buildDom && !domElementStack.isEmpty()) { + String text = new String(ch, start, length); + if (!text.trim().isEmpty() || !ignoreEmptyValue) { + org.w3c.dom.Text textNode = domDocument.createTextNode(text); + domElementStack.peek().appendChild(textNode); + } + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException { + // Complete structured output element processing + if (elementStack.isEmpty()) { + return; + } + + Map element = elementStack.pop(); + String elementName = elementNameStack.pop(); + StringBuilder textContent = textStack.pop(); + Map> repeatedElements = repeatedElementsStack.pop(); + + // Add repeated elements as arrays + for (Map.Entry> entry : repeatedElements.entrySet()) { + List values = entry.getValue(); + if (ignoreEmptyValue == false || values.isEmpty() == false) { + element.put(entry.getKey(), values); + } + } + + // Process text content and determine final element structure + String trimmedText = textContent.toString().trim(); + boolean hasText = trimmedText.isEmpty() == false; + boolean hasChildren = element.size() > 0; + + Object elementValue; + if (hasText == false && hasChildren == false) { + // Empty element + if (ignoreEmptyValue == false) { + elementValue = applyForceArray(elementName, null); + } else { + elementValue = null; + } + } else if (hasText && hasChildren == false) { + // Only text content + if (forceContent) { + Map contentMap = new HashMap<>(); + if (ignoreEmptyValue == false || isEmptyValue(trimmedText) == false) { + contentMap.put("#text", trimmedText); + } + elementValue = contentMap; + } else { + if (ignoreEmptyValue && isEmptyValue(trimmedText)) { + elementValue = null; + } else { + elementValue = trimmedText; + } + } + elementValue = applyForceArray(elementName, elementValue); + } else if (hasText == false && hasChildren) { + // Only child elements/attributes + elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; + } else { + // Both text and children/attributes + if (ignoreEmptyValue == false || isEmptyValue(trimmedText) == false) { + element.put("#text", trimmedText); + } + elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; + } + + // If this is the root element, store the result + if (elementStack.isEmpty()) { + if (elementValue != null) { + Map result = new HashMap<>(); + result.put(elementName, elementValue); + rootResult = result; + } + } else { + // Add to parent element + if (elementValue != null) { + Map parentElement = elementStack.peek(); + Map> parentRepeatedElements = repeatedElementsStack.peek(); + + if (parentElement.containsKey(elementName) || parentRepeatedElements.containsKey(elementName)) { + // Handle repeated elements + if (parentRepeatedElements.containsKey(elementName) == false) { + List list = new ArrayList<>(); + list.add(parentElement.get(elementName)); + parentRepeatedElements.put(elementName, list); + parentElement.remove(elementName); + } + parentRepeatedElements.get(elementName).add(elementValue); + } else { + // Apply force_array logic for single elements + Object finalContent = applyForceArray(elementName, elementValue); + parentElement.put(elementName, finalContent); + } + } + } + + // Complete DOM element if building DOM + if (buildDom && !domElementStack.isEmpty()) { + domElementStack.pop(); + } + } + + @Override + public void endDocument() throws org.xml.sax.SAXException { + // Document parsing complete + } + + public Object getStructuredResult() { + return rootResult; + } + + public Document getDomDocument() { + return domDocument; + } + + private String getElementName(String uri, String localName, String qName) { + String elementName; + if (removeNamespaces) { + elementName = localName != null && !localName.isEmpty() ? localName : qName; + } else { + elementName = qName; + } + + // Apply toLower if enabled + if (toLower) { + elementName = elementName.toLowerCase(Locale.ROOT); + } + + return elementName; + } + + private String getAttributeName(String uri, String localName, String qName) { + String attrName; + if (removeNamespaces) { + attrName = localName != null && !localName.isEmpty() ? localName : qName; + } else { + attrName = qName; + } + + // Apply toLower if enabled + if (toLower) { + attrName = attrName.toLowerCase(Locale.ROOT); + } + + return attrName; + } + } + + /** + * Creates a secure, pre-configured SAX parser factory for XML parsing. + * This factory is configured to prevent XXE attacks with SAX-specific features. + */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactory() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured namespace-aware SAX parser factory for XML parsing. + * This factory is configured to prevent XXE attacks and has namespace awareness enabled. + */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNamespaceAware() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + factory.setNamespaceAware(true); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured SAX parser factory for strict XML parsing. + * This factory is configured to prevent XXE attacks and has strict validation enabled. + */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryStrict() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + + // Enable strict parsing features + factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured namespace-aware SAX parser factory for strict XML parsing. + * This factory is configured to prevent XXE attacks, has namespace awareness enabled, and strict validation. + */ + private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNamespaceAwareStrict() { + javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); + factory.setValidating(false); + factory.setNamespaceAware(true); + + // Configure SAX-specific security features to prevent XXE attacks + try { + // SAX parser features - these are the correct features for SAXParserFactory + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + + // Enable strict parsing features + factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); + } catch (Exception e) { + // If features cannot be set, continue with default settings + } + + return factory; + } + + /** + * Creates a secure, pre-configured DocumentBuilderFactory for DOM creation. + * Since we only use this factory to create empty DOM documents programmatically + * (not to parse XML), XXE security features are not needed here. + * The SAX parser handles all XML parsing with appropriate security measures. + */ + private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); // Enable for maximum compatibility + factory.setValidating(false); + + // No XXE security features needed - we only create empty documents, + // never parse XML with this factory + + return factory; + } + + /** + * Selects the appropriate pre-configured SAX parser factory based on processor configuration. + * + * Factory selection matrix: + * - Regular parsing, no namespaces: SAX_PARSER_FACTORY + * - Regular parsing, with namespaces: SAX_PARSER_FACTORY_NS + * - Strict parsing, no namespaces: SAX_PARSER_FACTORY_STRICT + * - Strict parsing, with namespaces: SAX_PARSER_FACTORY_NS_STRICT + * + * @return the appropriate SAX parser factory for the current configuration + */ + private javax.xml.parsers.SAXParserFactory selectSaxParserFactory() { + if (isStrict()) { + return hasNamespaces() ? SAX_PARSER_FACTORY_NS_STRICT : SAX_PARSER_FACTORY_STRICT; + } else { + return hasNamespaces() ? SAX_PARSER_FACTORY_NS : SAX_PARSER_FACTORY; } } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index ba1ba839a9edc..d0f4437074359 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -16,67 +16,377 @@ import java.util.Map; import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.containsString; public class XmlProcessorFactoryTests extends ESTestCase { - public void testCreate() throws Exception { - XmlProcessor.Factory factory = new XmlProcessor.Factory(); + private static final String DEFAULT_FIELD = "field1"; + private static final String DEFAULT_TARGET_FIELD = "target"; + + /** + * Creates a new XmlProcessor.Factory instance for testing. + */ + private XmlProcessor.Factory createFactory() { + return new XmlProcessor.Factory(); + } + + /** + * Creates a basic configuration map with the specified field name. + */ + private Map createBaseConfig(String fieldName) { Map config = new HashMap<>(); - config.put("field", "field1"); - config.put("target_field", "target"); + config.put("field", fieldName); + return config; + } + + /** + * Creates a basic configuration map with the default field name. + */ + private Map createBaseConfig() { + return createBaseConfig(DEFAULT_FIELD); + } + + /** + * Creates a configuration map with XPath expressions. + */ + private Map createConfigWithXPath(String fieldName, Map xpathExpressions) { + Map config = createBaseConfig(fieldName); + config.put("xpath", xpathExpressions); + return config; + } + + /** + * Creates a configuration map with namespace definitions. + */ + private Map createConfigWithNamespaces(String fieldName, Map namespaces) { + Map config = createBaseConfig(fieldName); + config.put("namespaces", namespaces); + return config; + } + + /** + * Creates a configuration map with both XPath expressions and namespaces. + */ + private Map createConfigWithXPathAndNamespaces( + String fieldName, + Map xpathExpressions, + Map namespaces + ) { + Map config = createBaseConfig(fieldName); + config.put("xpath", xpathExpressions); + config.put("namespaces", namespaces); + return config; + } + + /** + * Creates a processor with the given factory and configuration. + */ + private XmlProcessor createProcessor(XmlProcessor.Factory factory, Map config) throws Exception { + String processorTag = randomAlphaOfLength(10); + return factory.create(null, processorTag, null, config, null); + } + + /** + * Creates a processor with the default factory and given configuration. + */ + private XmlProcessor createProcessor(Map config) throws Exception { + return createProcessor(createFactory(), config); + } + + /** + * Helper method to create XPath configuration map. + */ + private Map createXPathConfig(String... expressionsAndFields) { + if (expressionsAndFields.length % 2 != 0) { + throw new IllegalArgumentException("Must provide even number of arguments (expression, field, expression, field, ...)"); + } + + Map xpathConfig = new HashMap<>(); + for (int i = 0; i < expressionsAndFields.length; i += 2) { + xpathConfig.put(expressionsAndFields[i], expressionsAndFields[i + 1]); + } + return xpathConfig; + } + + /** + * Helper method to create namespace configuration map. + */ + private Map createNamespaceConfig(String... prefixesAndUris) { + if (prefixesAndUris.length % 2 != 0) { + throw new IllegalArgumentException("Must provide even number of arguments (prefix, uri, prefix, uri, ...)"); + } + + Map namespaceConfig = new HashMap<>(); + for (int i = 0; i < prefixesAndUris.length; i += 2) { + namespaceConfig.put(prefixesAndUris[i], prefixesAndUris[i + 1]); + } + return namespaceConfig; + } + + /** + * Helper method to create configuration with common boolean options. + */ + private Map createConfigWithOptions(String fieldName, String... options) { + Map config = createBaseConfig(fieldName); + + for (String option : options) { + switch (option) { + case "ignore_missing": + config.put("ignore_missing", true); + break; + case "ignore_failure": + config.put("ignore_failure", true); + break; + case "to_lower": + config.put("to_lower", true); + break; + case "ignore_empty_value": + config.put("ignore_empty_value", true); + break; + case "store_xml": + config.put("store_xml", false); // Test false case since default is true + break; + case "remove_namespaces": + config.put("remove_namespaces", true); + break; + case "force_content": + config.put("force_content", true); + break; + case "force_array": + config.put("force_array", true); + break; + case "strict": + config.put("parse_options", "strict"); + break; + default: + throw new IllegalArgumentException("Unknown option: " + option); + } + } + + return config; + } + + /** + * Helper to expect processor creation failure with specific message. + */ + private void expectCreationFailure(Map config, Class exceptionClass, String expectedMessage) { + XmlProcessor.Factory factory = createFactory(); + String processorTag = randomAlphaOfLength(10); + + Exception exception = expectThrows( + exceptionClass, + () -> factory.create(null, processorTag, null, config, null) + ); + assertThat(exception.getMessage(), equalTo(expectedMessage)); + } + + /** + * Tests processor creation with various configurations. + */ + public void testCreate() throws Exception { + Map config = createBaseConfig(); + config.put("target_field", DEFAULT_TARGET_FIELD); config.put("ignore_missing", true); config.put("ignore_failure", true); config.put("to_lower", true); config.put("ignore_empty_value", true); - String processorTag = randomAlphaOfLength(10); - XmlProcessor processor = factory.create(null, processorTag, null, config, null); + XmlProcessor processor = createProcessor(config); - assertThat(processor.getTag(), equalTo(processorTag)); - assertThat(processor.getField(), equalTo("field1")); - assertThat(processor.getTargetField(), equalTo("target")); + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getTargetField(), equalTo(DEFAULT_TARGET_FIELD)); assertThat(processor.isIgnoreMissing(), equalTo(true)); assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); } public void testCreateWithDefaults() throws Exception { - XmlProcessor.Factory factory = new XmlProcessor.Factory(); - Map config = new HashMap<>(); - config.put("field", "field1"); - - String processorTag = randomAlphaOfLength(10); - XmlProcessor processor = factory.create(null, processorTag, null, config, null); + Map config = createBaseConfig(); + XmlProcessor processor = createProcessor(config); - assertThat(processor.getTag(), equalTo(processorTag)); - assertThat(processor.getField(), equalTo("field1")); - assertThat(processor.getTargetField(), equalTo("field1")); + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getTargetField(), equalTo(DEFAULT_FIELD)); assertThat(processor.isIgnoreMissing(), equalTo(false)); assertThat(processor.isIgnoreEmptyValue(), equalTo(false)); } public void testCreateMissingField() throws Exception { - XmlProcessor.Factory factory = new XmlProcessor.Factory(); - Map config = new HashMap<>(); - - String processorTag = randomAlphaOfLength(10); - ElasticsearchParseException exception = expectThrows( - ElasticsearchParseException.class, - () -> factory.create(null, processorTag, null, config, null) - ); - assertThat(exception.getMessage(), equalTo("[field] required property is missing")); + Map config = new HashMap<>(); // Empty config - no field specified + expectCreationFailure(config, ElasticsearchParseException.class, "[field] required property is missing"); } public void testCreateWithIgnoreEmptyValueOnly() throws Exception { - XmlProcessor.Factory factory = new XmlProcessor.Factory(); - Map config = new HashMap<>(); - config.put("field", "field1"); + Map config = createBaseConfig(); config.put("ignore_empty_value", true); - String processorTag = randomAlphaOfLength(10); - XmlProcessor processor = factory.create(null, processorTag, null, config, null); + XmlProcessor processor = createProcessor(config); - assertThat(processor.getField(), equalTo("field1")); + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); assertThat(processor.isIgnoreMissing(), equalTo(false)); // other flags should remain default } + + public void testCreateWithXPath() throws Exception { + Map xpathConfig = createXPathConfig( + "//author/text()", "author_field", + "//title/@lang", "language_field" + ); + Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + } + + public void testCreateWithInvalidXPathConfig() throws Exception { + Map config = createBaseConfig(); + config.put("xpath", "invalid_string"); // Should be a map + + expectCreationFailure(config, IllegalArgumentException.class, "XPath configuration must be a map of expressions to target fields"); + } + + public void testCreateWithInvalidXPathTargetField() throws Exception { + Map config = createBaseConfig(); + + Map xpathConfig = new HashMap<>(); + xpathConfig.put("//author/text()", 123); // Should be string + config.put("xpath", xpathConfig); + + expectCreationFailure(config, IllegalArgumentException.class, "XPath target field [//author/text()] must be a string, got [Integer]"); + } + + public void testCreateWithNamespaces() throws Exception { + Map namespacesConfig = createNamespaceConfig( + "book", "http://example.com/book", + "author", "http://example.com/author" + ); + Map config = createConfigWithNamespaces(DEFAULT_FIELD, namespacesConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getNamespaces(), equalTo(namespacesConfig)); + } + + public void testCreateWithInvalidNamespacesConfig() throws Exception { + Map config = createBaseConfig(); + config.put("namespaces", "invalid_string"); // Should be a map + + expectCreationFailure(config, IllegalArgumentException.class, "Namespaces configuration must be a map of prefixes to URIs"); + } + + public void testCreateWithInvalidNamespaceURI() throws Exception { + Map config = createBaseConfig(); + + Map namespacesConfig = new HashMap<>(); + namespacesConfig.put("book", 123); // Should be string + config.put("namespaces", namespacesConfig); + + expectCreationFailure(config, IllegalArgumentException.class, "Namespace prefix [book] must have a string URI, got [Integer]"); + } + + public void testCreateWithXPathAndNamespaces() throws Exception { + Map xpathConfig = createXPathConfig( + "//book:author/text()", "author_field", + "//book:title/@lang", "language_field" + ); + Map namespacesConfig = createNamespaceConfig( + "book", "http://example.com/book" + ); + Map config = createConfigWithXPathAndNamespaces(DEFAULT_FIELD, xpathConfig, namespacesConfig); + + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getNamespaces(), equalTo(namespacesConfig)); + } + + // Tests for individual boolean options + + public void testCreateWithStoreXmlFalse() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "store_xml"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isStoreXml(), equalTo(false)); + } + + public void testCreateWithRemoveNamespaces() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "remove_namespaces"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isRemoveNamespaces(), equalTo(true)); + } + + public void testCreateWithForceContent() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "force_content"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isForceContent(), equalTo(true)); + } + + public void testCreateWithForceArray() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "force_array"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isForceArray(), equalTo(true)); + } + + public void testCreateWithStrictParseOptions() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, "strict"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.getParseOptions(), equalTo("strict")); + assertThat(processor.isStrict(), equalTo(true)); + } + + public void testCreateWithMultipleOptions() throws Exception { + Map config = createConfigWithOptions(DEFAULT_FIELD, + "ignore_missing", "force_content", "force_array", "remove_namespaces"); + XmlProcessor processor = createProcessor(config); + + assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); + assertThat(processor.isIgnoreMissing(), equalTo(true)); + assertThat(processor.isForceContent(), equalTo(true)); + assertThat(processor.isForceArray(), equalTo(true)); + assertThat(processor.isRemoveNamespaces(), equalTo(true)); + } + + // Tests for invalid parse options + + public void testCreateWithInvalidParseOptions() throws Exception { + Map config = createBaseConfig(); + config.put("parse_options", "invalid_option"); + + expectCreationFailure(config, IllegalArgumentException.class, "Invalid parse_options [invalid_option]. Only 'strict' is supported."); + } + + // Tests for XPath compilation errors (testing precompilation feature) + + public void testCreateWithInvalidXPathExpression() throws Exception { + Map xpathConfig = createXPathConfig("invalid xpath ][", "target_field"); + Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + + XmlProcessor.Factory factory = createFactory(); + String processorTag = randomAlphaOfLength(10); + + IllegalArgumentException exception = expectThrows( + IllegalArgumentException.class, + () -> factory.create(null, processorTag, null, config, null) + ); + + // Check that the error message contains the XPath expression and indicates it's invalid + assertThat(exception.getMessage(), containsString("Invalid XPath expression [invalid xpath ][]:")); + assertThat(exception.getMessage(), containsString("javax.xml.transform.TransformerException")); + } + + public void testCreateWithXPathUsingNamespacesWithoutConfiguration() throws Exception { + Map xpathConfig = createXPathConfig("//book:title/text()", "title_field"); + Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); + + expectCreationFailure(config, IllegalArgumentException.class, "Invalid XPath expression [//book:title/text()]: contains namespace prefixes but no namespace configuration provided"); + } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index 0b1ee4123f128..e48c6de593f84 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -10,617 +10,627 @@ package org.elasticsearch.ingest.common; import org.elasticsearch.ingest.IngestDocument; -import org.elasticsearch.ingest.RandomDocumentPicks; import org.elasticsearch.test.ESTestCase; +import static org.hamcrest.Matchers.equalTo; import java.util.HashMap; -import java.util.List; import java.util.Map; +import java.util.List; -import static org.hamcrest.Matchers.containsString; -import static org.hamcrest.Matchers.equalTo; - +/** + * Tests for {@link XmlProcessor}. These tests ensure feature parity and test coverage. + */ +@SuppressWarnings("unchecked") public class XmlProcessorTests extends ESTestCase { - public void testSimpleXmlDecodeWithTargetField() throws Exception { - String xml = """ - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "xml", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - - processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map xmlField = (Map) ingestDocument.getFieldValue("xml", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) xmlField.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); + private static final String XML_FIELD = "xmldata"; + private static final String TARGET_FIELD = "data"; - assertThat(book.get("seq"), equalTo("1")); - assertThat(book.get("author"), equalTo("William H. Gaddis")); - assertThat(book.get("title"), equalTo("The Recognitions")); - assertThat(book.get("review"), equalTo("One of the great seminal American novels of the 20th century.")); - - // Original field should remain unchanged - assertThat(ingestDocument.getFieldValue("message", String.class), equalTo(xml)); + private static IngestDocument createTestIngestDocument(String xml) { + return new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of(XML_FIELD, xml))); } - - public void testXmlDecodeToSameFieldWhenTargetIsField() throws Exception { - String xml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - - processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - - assertThat(book.get("seq"), equalTo("1")); - assertThat(book.get("author"), equalTo("William H. Gaddis")); - assertThat(book.get("title"), equalTo("The Recognitions")); - assertThat(book.get("review"), equalTo("One of the great seminal American novels of the 20th century.")); + + private static XmlProcessor createTestProcessor(Map config) { + config.putIfAbsent("field", XML_FIELD); + config.putIfAbsent("target_field", TARGET_FIELD); + + XmlProcessor.Factory factory = new XmlProcessor.Factory(); + try { + return factory.create(null, "_tag", null, config, null); + } catch (Exception e){ + fail("Failed to create XmlProcessor: " + e.getMessage()); + return null; // This line will never be reached, but is needed to satisfy the compiler + } } - public void testXmlDecodeWithArray() throws Exception { - String xml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - - Ralls, Kim - Midnight Rain - Some review. - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - - processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - List> books = (List>) catalog.get("book"); + /** + * Test parsing standard XML with attributes. + */ + public void testParseStandardXml() { + String xml = ""; - assertThat(books.size(), equalTo(2)); - - Map firstBook = books.get(0); - assertThat(firstBook.get("author"), equalTo("William H. Gaddis")); - assertThat(firstBook.get("title"), equalTo("The Recognitions")); - - Map secondBook = books.get(1); - assertThat(secondBook.get("author"), equalTo("Ralls, Kim")); - assertThat(secondBook.get("title"), equalTo("Midnight Rain")); - } - - public void testXmlDecodeWithToLower() throws Exception { - String xml = """ - - - - N/A - - - N/A - - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, true, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map auditbase = (Map) messageField.get("auditbase"); - @SuppressWarnings("unchecked") - Map contextcomponents = (Map) auditbase.get("contextcomponents"); - @SuppressWarnings("unchecked") - List> components = (List>) contextcomponents.get("component"); - - assertThat(components.size(), equalTo(2)); - assertThat(components.get(0).get("relyingparty"), equalTo("N/A")); - assertThat(components.get(1).get("primaryauth"), equalTo("N/A")); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + assertThat(foo.get("key"), equalTo("value")); } - - public void testXmlDecodeWithMultipleElements() throws Exception { - String xml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - - Ralls, Kim - Midnight Rain - Some review. - - - - Ralls, Kim - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen of the world. - - - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + /** + * Test parsing XML with array elements (multiple elements with same name). + */ + public void testParseXmlWithArrayValue() { + String xml = "value1value2"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - - // Check books array - @SuppressWarnings("unchecked") - List> books = (List>) catalog.get("book"); - assertThat(books.size(), equalTo(2)); - - // Check secondcategory - @SuppressWarnings("unchecked") - Map secondcategory = (Map) catalog.get("secondcategory"); - @SuppressWarnings("unchecked") - Map paper = (Map) secondcategory.get("paper"); - assertThat(paper.get("id"), equalTo("bk102")); - assertThat(paper.get("test2"), equalTo("Ralls, Kim")); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + List keyValues = (List) foo.get("key"); + assertThat(keyValues.size(), equalTo(2)); + + // The values might be nested inside their own lists + Object firstValue = keyValues.get(0); + assertThat(firstValue, equalTo("value1")); + + Object secondValue = keyValues.get(1); + assertThat(secondValue, equalTo("value2")); } - - public void testXmlDecodeWithUtf16Encoding() throws Exception { - String xml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + /** + * Test parsing XML with nested elements. + */ + public void testParseXmlWithNestedElements() { + String xml = "value"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - - assertThat(book.get("author"), equalTo("William H. Gaddis")); - assertThat(book.get("title"), equalTo("The Recognitions")); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + Map key1Map = (Map) foo.get("key1"); + assertThat(key1Map.size(), equalTo(1)); + + String key2Value = (String) key1Map.get("key2"); + assertThat(key2Value, equalTo("value")); } - public void testBrokenXmlWithIgnoreFailureFalse() { - String brokenXml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - catalog>"""; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", brokenXml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - - Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - assertThat(exception.getMessage(), containsString("contains invalid XML")); - } + /** + * Test parsing XML in a single item array. + */ + public void testParseXmlInSingleItemArray() { + String xml = ""; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); - public void testBrokenXmlWithIgnoreFailureTrue() throws Exception { - String brokenXml = """ - - - - William H. Gaddis - The Recognitions - One of the great seminal American novels of the 20th century. - - catalog>"""; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, true, false, false); - Map document = new HashMap<>(); - document.put("message", brokenXml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - - // Should not throw exception and leave document unchanged processor.execute(ingestDocument); - assertThat(ingestDocument.getFieldValue("message", String.class), equalTo(brokenXml)); - } - - public void testFieldNotFound() { - XmlProcessor processor = new XmlProcessor("tag", null, "nonexistent", "target", false, false, false, false); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); - - Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - assertThat(exception.getMessage(), containsString("not present as part of path [nonexistent]")); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + assertThat(foo.get("bar"), equalTo("baz")); } - public void testFieldNotFoundWithIgnoreMissing() throws Exception { - XmlProcessor processor = new XmlProcessor("tag", null, "nonexistent", "target", true, false, false, false); - IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); - IngestDocument ingestDocument = new IngestDocument(originalDocument); + /** + * Test extracting a single element using XPath. + */ + public void testXPathSingleElementExtraction() { + String xml = "helloworld"; + + Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - // Document should remain unchanged - assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + + // Get the XPath result + Object barContent = ingestDocument.getFieldValue("bar_content", Object.class); + assertNotNull(barContent); + assertEquals("hello", barContent); + + // Verify that the full parsed XML is also available + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + assertNotNull(foo); + assertThat(foo.get("bar"), equalTo("hello")); + assertThat(foo.get("baz"), equalTo("world")); } - public void testNullValue() { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", null); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + /** + * Test extracting multiple elements using XPath. + */ + public void testXPathMultipleElementsExtraction() { + String xml = "firstsecondthird"; + + Map xpathMap = Map.of("/foo/bar", "all_bars"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); - Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - assertThat(exception.getMessage(), containsString("field [field] is null")); + processor.execute(ingestDocument); + + List allBars = ingestDocument.getFieldValue("all_bars", List.class); + + assertNotNull(allBars); + assertThat(allBars.size(), equalTo(3)); + assertThat(allBars.get(0), equalTo("first")); + assertThat(allBars.get(1), equalTo("second")); + assertThat(allBars.get(2), equalTo("third")); } - public void testNullValueWithIgnoreMissing() throws Exception { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, false, false, false); - Map document = new HashMap<>(); - document.put("field", null); - IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - IngestDocument ingestDocument = new IngestDocument(originalDocument); + /** + * Test extracting attributes using XPath. + */ + public void testXPathAttributeExtraction() { + String xml = "content"; + + Map xpathMap = new HashMap<>(); + xpathMap.put("/foo/bar/@id", "bar_id"); + xpathMap.put("/foo/bar/@type", "bar_type"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - // Document should remain unchanged - assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + + String barId = ingestDocument.getFieldValue("bar_id", String.class); + assertNotNull(barId); + assertThat(barId, equalTo("123")); + + String barType = ingestDocument.getFieldValue("bar_type", String.class); + assertNotNull(barType); + assertThat(barType, equalTo("test")); } - public void testNonStringValueWithIgnoreFailureFalse() { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", 123); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + /** + * Test extracting elements with namespaces using XPath. + */ + public void testXPathNamespacedExtraction() { + String xml = "" + + "" + + " namespace-value" + + " regular-value" + + ""; + + Map namespaces = Map.of("myns", "http://example.org/ns1"); + Map xpathMap = Map.of("//myns:element/text()", "ns_value"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + config.put("namespaces", namespaces); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); - Exception exception = expectThrows(IllegalArgumentException.class, () -> processor.execute(ingestDocument)); - assertThat(exception.getMessage(), containsString("field [field] is not a string")); + processor.execute(ingestDocument); + + String nsValue = ingestDocument.getFieldValue("ns_value", String.class); + assertNotNull(nsValue); + assertThat(nsValue, equalTo("namespace-value")); } - public void testNonStringValueWithIgnoreFailureTrue() throws Exception { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, true, false, false); - Map document = new HashMap<>(); - document.put("field", 123); - IngestDocument originalDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - IngestDocument ingestDocument = new IngestDocument(originalDocument); + /** + * Test parsing XML with mixed content (text and elements mixed together). + */ + public void testParseXmlWithMixedContent() { + String xml = "This text is bold and this is italic!"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - // Document should remain unchanged - assertThat(ingestDocument.getSourceAndMetadata(), equalTo(originalDocument.getSourceAndMetadata())); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + assertNotNull(foo.get("b")); + assertThat((String)foo.get("b"), equalTo("bold")); + assertNotNull(foo.get("i")); + assertThat((String)foo.get("i"), equalTo("italic")); + assertNotNull(foo.get("#text")); + assertThat((String)foo.get("#text"), equalTo("This text is and this is !")); } - - public void testEmptyXml() throws Exception { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", ""); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + /** + * Test parsing XML with CDATA sections. + */ + public void testParseXmlWithCDATA() { + String xml = " that shouldn't be parsed!]]>"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - // Empty XML should result in null target - assertThat(ingestDocument.getFieldValue("target", Object.class), equalTo(null)); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Object content = data.get("foo"); + + assertNotNull(content); + assertThat(content, equalTo("This is CDATA content with that shouldn't be parsed!")); } - - public void testWhitespaceOnlyXml() throws Exception { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", " \n\t "); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + + /** + * Test parsing XML with numeric data. + */ + public void testParseXmlWithNumericData() { + String xml = "12399.95true"; + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - // Whitespace-only XML should result in null target - assertThat(ingestDocument.getFieldValue("target", Object.class), equalTo(null)); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + assertThat((String)foo.get("count"), equalTo("123")); + assertThat((String)foo.get("price"), equalTo("99.95")); + assertThat((String)foo.get("active"), equalTo("true")); } - public void testSelfClosingTag() throws Exception { - String xml = ""; - - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + /** + * Test parsing XML with force_array option enabled. + */ + public void testParseXmlWithForceArray() { + String xml = "single_value"; + + Map config = new HashMap<>(); + config.put("force_array", true); // Enable force_array option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map result = (Map) ingestDocument.getFieldValue("target", Object.class); - assertThat(result.get("empty"), equalTo(null)); + + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); + Map foo = (Map) data.get("foo"); + + // With force_array=true, even single values should be in arrays + Object barValue = foo.get("bar"); + assertNotNull(barValue); + assertTrue("Expected bar value to be a List with force_array=true", barValue instanceof List); + + List barList = (List) barValue; + assertThat(barList.size(), equalTo(1)); + assertThat(barList.get(0), equalTo("single_value")); } - public void testSelfClosingTagWithAttributes() throws Exception { - String xml = ""; - - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); + /** + * Test extracting multiple elements using multiple XPath expressions. + * Tests that multiple XPath expressions can be used simultaneously. + */ + public void testMultipleXPathExpressions() { + String xml = "" + + " John30" + + " Jane25" + + ""; + + // Configure multiple XPath expressions + Map xpathMap = new HashMap<>(); + xpathMap.put("/root/person[1]/n/text()", "first_person_name"); + xpathMap.put("/root/person[2]/n/text()", "second_person_name"); + xpathMap.put("/root/person/@id", "person_ids"); + + Map config = new HashMap<>(); + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map result = (Map) ingestDocument.getFieldValue("target", Object.class); - @SuppressWarnings("unchecked") - Map empty = (Map) result.get("empty"); - assertThat(empty.get("id"), equalTo("123")); - assertThat(empty.get("name"), equalTo("test")); + + assertTrue("first_person_name field should exist", ingestDocument.hasField("first_person_name")); + assertTrue("second_person_name field should exist", ingestDocument.hasField("second_person_name")); + assertTrue("person_ids field should exist", ingestDocument.hasField("person_ids")); + + Object firstName = ingestDocument.getFieldValue("first_person_name", Object.class); + assertEquals("John", firstName); + + Object secondName = ingestDocument.getFieldValue("second_person_name", Object.class); + assertEquals("Jane", secondName); + + Object personIdsObj = ingestDocument.getFieldValue("person_ids", Object.class); + assertTrue("person_ids should be a List", personIdsObj instanceof List); + List personIds = (List) personIdsObj; + assertEquals("Should have 2 person IDs", 2, personIds.size()); + assertEquals("First person ID should be '1'", "1", personIds.get(0)); + assertEquals("Second person ID should be '2'", "2", personIds.get(1)); + + assertTrue("Target field should exist", ingestDocument.hasField(TARGET_FIELD)); } - - public void testGetType() { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - assertThat(processor.getType(), equalTo("xml")); + + /** + * Test handling of invalid XML with ignoreFailure=false. + */ + public void testInvalidXml() { + String xml = ""; // Invalid XML missing closing tag + + Map config = new HashMap<>(); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { + processor.execute(ingestDocument); + }); + + assertTrue("Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || + exception.getCause().getMessage().contains("XML")); } - - public void testGetters() { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, true, true, false); - assertThat(processor.getField(), equalTo("field")); - assertThat(processor.getTargetField(), equalTo("target")); - assertThat(processor.isIgnoreMissing(), equalTo(true)); + + /** + * Test handling of invalid XML with ignoreFailure=true. + */ + public void testInvalidXmlWithIgnoreFailure() { + String xml = ""; // Invalid XML missing closing tag + + Map config = new HashMap<>(); + config.put("ignore_failure", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + List tags = ingestDocument.getFieldValue("tags", List.class); + assertNotNull(tags); + assertTrue(tags.contains("_xmlparsefailure")); } - - public void testIgnoreEmptyValueEnabled() throws Exception { - String xml = """ - - - William H. Gaddis - - One of the great seminal American novels. - - - - Some content - - - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, true); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - + + /** + * Test the store_xml=false option to not store parsed XML in target field. + */ + public void testNoStoreXml() { + String xml = "value"; + + // Set up XPath to extract value but don't store XML + Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); + + Map config = new HashMap<>(); + config.put("store_xml", false); // Do not store XML in target field + config.put("xpath", xpathMap); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - - // Empty title should be filtered out - assertThat(book.containsKey("title"), equalTo(false)); - // Empty element should be filtered out - assertThat(book.containsKey("empty"), equalTo(false)); - // empty_book should be filtered out entirely - assertThat(catalog.containsKey("empty_book"), equalTo(false)); - - // Valid content should remain - assertThat(book.get("author"), equalTo("William H. Gaddis")); - assertThat(book.get("review"), equalTo("One of the great seminal American novels.")); - - // Nested structure handling - @SuppressWarnings("unchecked") - Map nested = (Map) book.get("nested"); - assertThat(nested.containsKey("empty_text"), equalTo(false)); // Whitespace-only should be filtered - assertThat(nested.get("valid_content"), equalTo("Some content")); + + // Verify XPath result is stored + String barContent = ingestDocument.getFieldValue("bar_content", String.class); + assertNotNull(barContent); + assertThat(barContent, equalTo("value")); + + // Verify the target field was not created + assertFalse(ingestDocument.hasField(TARGET_FIELD)); } - - public void testIgnoreEmptyValueWithArrays() throws Exception { - String xml = """ - - - Valid Book - - - - - - Another Valid Book - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, true); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - + + /** + * Test the to_lower option for converting field names to lowercase. + */ + public void testToLower() { + String xml = "value"; + + Map config = new HashMap<>(); + config.put("to_lower", true); // Enable to_lower option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - List> books = (List>) catalog.get("book"); - - // Should have 2 books after filtering out the one with empty title - assertThat(books.size(), equalTo(2)); - assertThat(books.get(0).get("title"), equalTo("Valid Book")); - assertThat(books.get(1).get("title"), equalTo("Another Valid Book")); + + // Verify field names are lowercase + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + assertTrue(data.containsKey("foo")); + assertFalse(data.containsKey("FOO")); + + Map foo = (Map) data.get("foo"); + assertTrue(foo.containsKey("bar")); + assertFalse(foo.containsKey("BAR")); + assertThat(foo.get("bar"), equalTo("value")); } - - public void testIgnoreEmptyValueDisabled() throws Exception { - String xml = """ - - - William H. Gaddis - - - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "message", "message", false, false, false, false); - Map document = new HashMap<>(); - document.put("message", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - + + /** + * Test the ignore_missing option when field is missing. + */ + public void testIgnoreMissing() { + String xmlField = "nonexistent_field"; + + Map config = new HashMap<>(); + config.put("field", xmlField); + config.put("ignore_missing", true); // Enable ignore_missing option + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of())); processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map messageField = (Map) ingestDocument.getFieldValue("message", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) messageField.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - - // Empty values should remain when ignore_empty_value is false - assertThat(book.containsKey("title"), equalTo(true)); - assertThat(book.get("title"), equalTo(null)); // Empty elements are parsed as null - assertThat(book.containsKey("empty"), equalTo(true)); - assertThat(book.get("empty"), equalTo(null)); - assertThat(book.get("author"), equalTo("William H. Gaddis")); + + assertFalse("Target field should not be created when source field is missing", + ingestDocument.hasField(TARGET_FIELD)); + + // With ignoreMissing=false + config.put("ignore_missing", false); + XmlProcessor failingProcessor = createTestProcessor(config); + + // This should throw an exception + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { + failingProcessor.execute(ingestDocument); + }); + + assertTrue(exception.getMessage().contains("not present as part of path")); } - - public void testGettersWithIgnoreEmptyValue() { - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", true, true, true, true); - assertThat(processor.getField(), equalTo("field")); - assertThat(processor.getTargetField(), equalTo("target")); - assertThat(processor.isIgnoreMissing(), equalTo(true)); - assertThat(processor.isIgnoreEmptyValue(), equalTo(true)); + + /** + * Test that ignore_empty_value correctly filters out empty values from arrays and mixed content. + */ + public void testIgnoreEmptyValue() { + // XML with mixed empty and non-empty elements, including array elements with mixed empty/non-empty values + String xml = "" + + " " + + " " + + " content" + + " nested-content" + + " " + + " first" + + " " + + " third" + + " " + + " fifth" + + " " + + " Text with and content" + + ""; + + Map config = new HashMap<>(); + config.put("ignore_empty_value", true); + XmlProcessor processor = createTestProcessor(config); + + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); + + Map result = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map root = (Map) result.get("root"); + + // Check empty elements are filtered + assertFalse("Empty element should be filtered out", root.containsKey("empty")); + assertFalse("Blank element should be filtered out", root.containsKey("blank")); + + // Check valid elements are preserved + assertTrue("Valid element should be preserved", root.containsKey("valid")); + assertEquals("content", root.get("valid")); + + // Check nested structure filtering + Map nested = (Map) root.get("nested"); + assertNotNull("Nested element should be preserved", nested); + assertFalse("Empty nested element should be filtered", nested.containsKey("empty")); + assertEquals("nested-content", nested.get("valid")); + + // Check array with mixed empty/non-empty values + Map items = (Map) root.get("items"); + assertNotNull("Items object should be preserved", items); + List itemList = (List) items.get("item"); + assertNotNull("Item array should be preserved", itemList); + assertEquals("Array should contain only non-empty items", 3, itemList.size()); + assertEquals("first", itemList.get(0)); + assertEquals("third", itemList.get(1)); + assertEquals("fifth", itemList.get(2)); + + // Check mixed content handling + Map mixed = (Map) root.get("mixed"); + assertNotNull("Mixed content should be preserved", mixed); + assertFalse("Empty element in mixed content should be filtered", mixed.containsKey("empty")); + assertTrue("Valid element in mixed content should be preserved", mixed.containsKey("valid")); + assertEquals("content", mixed.get("valid")); + assertEquals("Text with and", mixed.get("#text")); } - - public void testElementsWithAttributesAndTextContent() throws Exception { - String xml = """ - - - The Recognitions - William H. Gaddis - 29.99 - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, false, false); - Map document = new HashMap<>(); - document.put("field", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - + + /** + * Test parsing with strict mode option. + */ + public void testStrictParsing() { + String xml = "valid"; + + Map config = new HashMap<>(); + config.put("parse_options", "strict"); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); - - @SuppressWarnings("unchecked") - Map result = (Map) ingestDocument.getFieldValue("target", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) result.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - @SuppressWarnings("unchecked") - Map title = (Map) book.get("title"); - @SuppressWarnings("unchecked") - Map author = (Map) book.get("author"); - @SuppressWarnings("unchecked") - Map price = (Map) book.get("price"); - - // Test catalog with attributes only - assertThat(catalog.get("version"), equalTo("1.0")); - - // Test book with attributes only (no text content) - assertThat(book.get("id"), equalTo("123")); - assertThat(book.get("isbn"), equalTo("978-0-684-80335-9")); - - // Test elements with both attributes and text content (should use #text key) - assertThat(title.get("lang"), equalTo("en")); - assertThat(title.get("#text"), equalTo("The Recognitions")); - - assertThat(author.get("nationality"), equalTo("American")); - assertThat(author.get("#text"), equalTo("William H. Gaddis")); - - assertThat(price.get("currency"), equalTo("USD")); - assertThat(price.get("#text"), equalTo("29.99")); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map foo = (Map) data.get("foo"); + assertThat(foo.get("bar"), equalTo("valid")); + + // Test with invalid XML in strict mode + String invalidXml = ""; + IngestDocument invalidDocument = createTestIngestDocument(invalidXml); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { + processor.execute(invalidDocument); + }); + + assertTrue("Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || + exception.getCause().getMessage().contains("XML")); } - - public void testMixedAttributesAndTextWithToLower() throws Exception { - String xml = """ - - - The Recognitions - William H. Gaddis - - """; - - XmlProcessor processor = new XmlProcessor("tag", null, "field", "target", false, false, true, false); - Map document = new HashMap<>(); - document.put("field", xml); - IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); - + + /** + * Test parsing XML with remove_namespaces option. + */ + public void testRemoveNamespaces() { + String xml = "value"; + + Map config = new HashMap<>(); + config.put("remove_namespaces", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map foo = (Map) data.get("foo"); + + assertTrue("Element with namespace should be present", foo.containsKey("ns:bar")); + assertThat(foo.get("ns:bar"), equalTo("value")); + + // Now test with removeNamespaces=false + IngestDocument ingestDocument2 = createTestIngestDocument(xml); + + config.put("remove_namespaces", false); + XmlProcessor processor2 = createTestProcessor(config); + processor2.execute(ingestDocument2); + + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); + Map foo2 = (Map) data2.get("foo"); + + // With removeNamespaces=false, the "ns:" prefix should be preserved + assertTrue("Element should be accessible with namespace prefix", foo2.containsKey("ns:bar")); + assertThat(foo2.get("ns:bar"), equalTo("value")); + } - @SuppressWarnings("unchecked") - Map result = (Map) ingestDocument.getFieldValue("target", Object.class); - @SuppressWarnings("unchecked") - Map catalog = (Map) result.get("catalog"); - @SuppressWarnings("unchecked") - Map book = (Map) catalog.get("book"); - @SuppressWarnings("unchecked") - Map title = (Map) book.get("title"); - @SuppressWarnings("unchecked") - Map author = (Map) book.get("author"); - - // Test that element names are converted to lowercase - assertThat(catalog.get("version"), equalTo("1.0")); - assertThat(book.get("id"), equalTo("123")); - - // Test that attribute names are converted to lowercase but values remain unchanged - assertThat(title.get("lang"), equalTo("EN")); - assertThat(title.get("#text"), equalTo("The Recognitions")); - - assertThat(author.get("nationality"), equalTo("AMERICAN")); - assertThat(author.get("#text"), equalTo("William H. Gaddis")); + /** + * Test the force_content option. + */ + public void testForceContent() { + String xml = "simple text"; + + Map config = new HashMap<>(); + config.put("force_content", true); + XmlProcessor processor = createTestProcessor(config); + IngestDocument ingestDocument = createTestIngestDocument(xml); + + processor.execute(ingestDocument); + + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); + Map foo = (Map) data.get("foo"); + + // With forceContent=true, the text should be in a #text field + assertTrue("Text content should be in #text field", foo.containsKey("#text")); + assertThat(foo.get("#text"), equalTo("simple text")); + + // Now test with forceContent=false + config.put("force_content", false); + XmlProcessor processor2 = createTestProcessor(config); + IngestDocument ingestDocument2 = createTestIngestDocument(xml); + + processor2.execute(ingestDocument2); + + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); + + // With forceContent=false, the text should be directly assigned to the element + assertThat(data2.get("foo"), equalTo("simple text")); } } From 002802d1af34999b8c6d84bdaacd0e5c38d16fec Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 4 Jul 2025 14:09:32 +0000 Subject: [PATCH 7/9] [CI] Auto commit changes from spotless --- .../ingest/common/XmlProcessor.java | 213 ++++++------ .../common/XmlProcessorFactoryTests.java | 84 ++--- .../ingest/common/XmlProcessorTests.java | 313 +++++++++--------- 3 files changed, 322 insertions(+), 288 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 582b8d9f3e787..2bcb8666dba6b 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -14,6 +14,9 @@ import org.elasticsearch.ingest.ConfigurationUtils; import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; import java.util.ArrayList; import java.util.HashMap; @@ -31,10 +34,6 @@ import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; - /** * Processor that parses XML documents and converts them to JSON objects using a single-pass streaming approach. * @@ -49,15 +48,16 @@ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; - + private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); - + // Pre-configured SAX parser factories for secure XML parsing private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_STRICT = createSecureSaxParserFactoryStrict(); - private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = createSecureSaxParserFactoryNamespaceAwareStrict(); - + private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS_STRICT = + createSecureSaxParserFactoryNamespaceAwareStrict(); + // Pre-configured document builder factory for DOM creation private static final DocumentBuilderFactory DOM_FACTORY = createSecureDocumentBuilderFactory(); @@ -202,13 +202,13 @@ public String getType() { /** * Determines if a value should be considered empty for filtering purposes. * Used by the ignore_empty_value feature to filter out empty content. - * + * * Considers empty: * - null values * - empty or whitespace-only strings * - empty Maps * - empty Lists - * + * * @param value the value to check * @return true if the value should be considered empty */ @@ -235,7 +235,7 @@ private boolean isEmptyValue(Object value) { * - ATTRIBUTE_NODE: returns attribute value * - ELEMENT_NODE: returns text content (concatenated text of all descendants) * - Other node types: returns text content as fallback - * + * * @param node the DOM node to extract text from * @return the text content of the node, or null if node is null */ @@ -243,7 +243,7 @@ private String getNodeValue(Node node) { if (node == null) { return null; } - + switch (node.getNodeType()) { case Node.ATTRIBUTE_NODE: case Node.CDATA_SECTION_NODE: @@ -263,7 +263,7 @@ private String getNodeValue(Node node) { * - If force_array is true and content is already a List: returns content unchanged * - If force_array is true and content is not a List: wraps content in a new ArrayList * - Handles null content appropriately (wraps null in array if force_array is true) - * + * * @param elementName the name of the element (for context, not used in current implementation) * @param content the content to potentially wrap in an array * @return the content, optionally wrapped in an array based on force_array setting @@ -279,13 +279,13 @@ private Object applyForceArray(String elementName, Object content) { /** * Evaluates precompiled XPath expressions against a DOM document and adds results to the ingest document. - * + * * Features: * - Uses precompiled XPath expressions for optimal performance * - Extracts text values from matched nodes (elements, attributes, text nodes) * - Single matches stored as strings, multiple matches as string arrays * - Respects ignoreFailure setting for XPath evaluation errors - * + * * @param document the ingest document to add XPath results to * @param doc the DOM document to evaluate XPath expressions against * @throws Exception if XPath processing fails and ignoreFailure is false @@ -295,14 +295,14 @@ private void processXPathExpressionsFromDom(IngestDocument document, Document do for (Map.Entry entry : compiledXPathExpressions.entrySet()) { String targetFieldName = entry.getKey(); XPathExpression compiledExpression = entry.getValue(); - + try { Object result = compiledExpression.evaluate(doc, XPathConstants.NODESET); - + if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; List values = new ArrayList<>(); - + for (int i = 0; i < nodeList.getLength(); i++) { Node node = nodeList.item(i); String value = getNodeValue(node); @@ -310,7 +310,7 @@ private void processXPathExpressionsFromDom(IngestDocument document, Document do values.add(value); } } - + if (values.isEmpty() == false) { if (values.size() == 1) { document.setFieldValue(targetFieldName, values.get(0)); @@ -321,7 +321,10 @@ private void processXPathExpressionsFromDom(IngestDocument document, Document do } } catch (XPathExpressionException e) { if (ignoreFailure == false) { - throw new IllegalArgumentException("XPath evaluation failed for target field [" + targetFieldName + "]: " + e.getMessage(), e); + throw new IllegalArgumentException( + "XPath evaluation failed for target field [" + targetFieldName + "]: " + e.getMessage(), + e + ); } } } @@ -331,23 +334,23 @@ private void processXPathExpressionsFromDom(IngestDocument document, Document do * Compiles XPath expressions at processor creation time for optimal runtime performance. * This method pre-compiles all configured XPath expressions with appropriate namespace context, * eliminating the compilation overhead during document processing. - * + * * @param xpathExpressions map of XPath expressions to target field names * @param namespaces map of namespace prefixes to URIs * @return map of compiled XPath expressions keyed by target field name * @throws IllegalArgumentException if XPath compilation fails or namespace validation fails */ private static Map compileXPathExpressions( - Map xpathExpressions, + Map xpathExpressions, Map namespaces ) { if (xpathExpressions.isEmpty()) { return Map.of(); } - + Map compiled = new HashMap<>(); XPath xpath = XPATH_FACTORY.newXPath(); - + // Set namespace context if namespaces are defined boolean hasNamespaces = namespaces.isEmpty() == false; if (hasNamespaces) { @@ -382,32 +385,31 @@ public Iterator getPrefixes(String namespaceURI) { } }); } - + // Pre-compiled pattern to detect namespace prefixes - java.util.regex.Pattern namespacePattern = - java.util.regex.Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); - + java.util.regex.Pattern namespacePattern = java.util.regex.Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); + for (Map.Entry entry : xpathExpressions.entrySet()) { String xpathExpression = entry.getKey(); String targetFieldName = entry.getValue(); - + // Validate namespace prefixes if no namespaces are configured if (!hasNamespaces && namespacePattern.matcher(xpathExpression).matches()) { throw new IllegalArgumentException( - "Invalid XPath expression [" + xpathExpression + "]: contains namespace prefixes but no namespace configuration provided" + "Invalid XPath expression [" + + xpathExpression + + "]: contains namespace prefixes but no namespace configuration provided" ); } - + try { XPathExpression compiledExpression = xpath.compile(xpathExpression); compiled.put(targetFieldName, compiledExpression); } catch (XPathExpressionException e) { - throw new IllegalArgumentException( - "Invalid XPath expression [" + xpathExpression + "]: " + e.getMessage(), e - ); + throw new IllegalArgumentException("Invalid XPath expression [" + xpathExpression + "]: " + e.getMessage(), e); } } - + return Map.copyOf(compiled); } @@ -431,7 +433,7 @@ public XmlProcessor create( boolean removeNamespaces = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "remove_namespaces", false); boolean forceContent = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_content", false); boolean forceArray = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "force_array", false); - + // Parse XPath expressions map Map xpathExpressions = new HashMap<>(); Object xpathConfig = config.get("xpath"); @@ -444,7 +446,11 @@ public XmlProcessor create( xpathExpressions.put(entry.getKey(), (String) entry.getValue()); } else { throw new IllegalArgumentException( - "XPath target field [" + entry.getKey() + "] must be a string, got [" + entry.getValue().getClass().getSimpleName() + "]" + "XPath target field [" + + entry.getKey() + + "] must be a string, got [" + + entry.getValue().getClass().getSimpleName() + + "]" ); } } @@ -465,7 +471,11 @@ public XmlProcessor create( namespaces.put(entry.getKey(), (String) entry.getValue()); } else { throw new IllegalArgumentException( - "Namespace prefix [" + entry.getKey() + "] must have a string URI, got [" + entry.getValue().getClass().getSimpleName() + "]" + "Namespace prefix [" + + entry.getKey() + + "] must have a string URI, got [" + + entry.getValue().getClass().getSimpleName() + + "]" ); } } @@ -480,14 +490,30 @@ public XmlProcessor create( throw new IllegalArgumentException("Invalid parse_options [" + parseOptions + "]. Only 'strict' is supported."); } - return new XmlProcessor(processorTag, description, field, targetField, ignoreMissing, ignoreFailure, toLower, ignoreEmptyValue, storeXml, removeNamespaces, forceContent, forceArray, xpathExpressions, namespaces, parseOptions); + return new XmlProcessor( + processorTag, + description, + field, + targetField, + ignoreMissing, + ignoreFailure, + toLower, + ignoreEmptyValue, + storeXml, + removeNamespaces, + forceContent, + forceArray, + xpathExpressions, + namespaces, + parseOptions + ); } } /** * Main XML parsing method that converts XML to JSON and optionally extracts XPath values. * Uses streaming SAX parser with optional DOM building for XPath processing. - * + * * @param document the ingest document to modify with parsed results * @param xmlString the XML string to parse (should be trimmed) * @throws Exception if XML parsing fails @@ -499,12 +525,12 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws // Determine if we need DOM for XPath processing boolean needsDom = xpathExpressions.isEmpty() == false; - + // Use the appropriate pre-configured SAX parser factory javax.xml.parsers.SAXParserFactory factory = selectSaxParserFactory(); - + javax.xml.parsers.SAXParser parser = factory.newSAXParser(); - + // Configure error handler for strict mode if (isStrict()) { parser.getXMLReader().setErrorHandler(new org.xml.sax.ErrorHandler() { @@ -512,24 +538,24 @@ private void parseXmlAndXPath(IngestDocument document, String xmlString) throws public void warning(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { throw exception; } - + @Override public void error(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { throw exception; } - + @Override public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.sax.SAXException { throw exception; } }); } - + // Use enhanced handler that can build DOM during streaming when needed XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); - + parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes("UTF-8")), handler); - + // Store structured result if needed if (storeXml) { Object streamingResult = handler.getStructuredResult(); @@ -537,7 +563,7 @@ public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.s document.setFieldValue(targetField, streamingResult); } } - + // Process XPath expressions if DOM was built during streaming if (needsDom) { Document domDocument = handler.getDomDocument(); @@ -558,12 +584,12 @@ private class XmlStreamingWithDomHandler extends org.xml.sax.helpers.DefaultHand private final java.util.Deque textStack = new java.util.ArrayDeque<>(); private final java.util.Deque>> repeatedElementsStack = new java.util.ArrayDeque<>(); private Object rootResult = null; - + // DOM building state (for XPath processing when needed) private final boolean buildDom; private Document domDocument = null; private final java.util.Deque domElementStack = new java.util.ArrayDeque<>(); - + public XmlStreamingWithDomHandler(boolean buildDom) { this.buildDom = buildDom; } @@ -585,29 +611,30 @@ public void startDocument() throws org.xml.sax.SAXException { } @Override - public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException { + public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) + throws org.xml.sax.SAXException { String elementName = getElementName(uri, localName, qName); - + // Build structured representation (always) Map element = new HashMap<>(); Map> repeatedElements = new HashMap<>(); - + // Process attributes for structured output for (int i = 0; i < attributes.getLength(); i++) { String attrName = getAttributeName(attributes.getURI(i), attributes.getLocalName(i), attributes.getQName(i)); String attrValue = attributes.getValue(i); - + // Apply ignoreEmptyValue filtering to attributes if (ignoreEmptyValue == false || isEmptyValue(attrValue) == false) { element.put(attrName, attrValue); } } - + elementStack.push(element); elementNameStack.push(elementName); textStack.push(new StringBuilder()); repeatedElementsStack.push(repeatedElements); - + // Build DOM element simultaneously if needed if (buildDom && domDocument != null) { org.w3c.dom.Element domElement; @@ -616,28 +643,28 @@ public void startElement(String uri, String localName, String qName, org.xml.sax } else { domElement = domDocument.createElement(removeNamespaces ? localName : qName); } - + // Add attributes to DOM element for (int i = 0; i < attributes.getLength(); i++) { String attrUri = attributes.getURI(i); String attrLocalName = attributes.getLocalName(i); String attrQName = attributes.getQName(i); String attrValue = attributes.getValue(i); - + if (attrUri != null && !attrUri.isEmpty() && !removeNamespaces) { domElement.setAttributeNS(attrUri, attrQName, attrValue); } else { domElement.setAttribute(removeNamespaces ? attrLocalName : attrQName, attrValue); } } - + // Add to parent or root if (domElementStack.isEmpty()) { domDocument.appendChild(domElement); } else { domElementStack.peek().appendChild(domElement); } - + domElementStack.push(domElement); } } @@ -648,7 +675,7 @@ public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXE if (!textStack.isEmpty()) { textStack.peek().append(ch, start, length); } - + // Add to DOM text node if needed if (buildDom && !domElementStack.isEmpty()) { String text = new String(ch, start, length); @@ -665,12 +692,12 @@ public void endElement(String uri, String localName, String qName) throws org.xm if (elementStack.isEmpty()) { return; } - + Map element = elementStack.pop(); String elementName = elementNameStack.pop(); StringBuilder textContent = textStack.pop(); Map> repeatedElements = repeatedElementsStack.pop(); - + // Add repeated elements as arrays for (Map.Entry> entry : repeatedElements.entrySet()) { List values = entry.getValue(); @@ -678,12 +705,12 @@ public void endElement(String uri, String localName, String qName) throws org.xm element.put(entry.getKey(), values); } } - + // Process text content and determine final element structure String trimmedText = textContent.toString().trim(); boolean hasText = trimmedText.isEmpty() == false; boolean hasChildren = element.size() > 0; - + Object elementValue; if (hasText == false && hasChildren == false) { // Empty element @@ -718,7 +745,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm } elementValue = (forceArray && forceContent) ? applyForceArray(elementName, element) : element; } - + // If this is the root element, store the result if (elementStack.isEmpty()) { if (elementValue != null) { @@ -731,7 +758,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm if (elementValue != null) { Map parentElement = elementStack.peek(); Map> parentRepeatedElements = repeatedElementsStack.peek(); - + if (parentElement.containsKey(elementName) || parentRepeatedElements.containsKey(elementName)) { // Handle repeated elements if (parentRepeatedElements.containsKey(elementName) == false) { @@ -748,7 +775,7 @@ public void endElement(String uri, String localName, String qName) throws org.xm } } } - + // Complete DOM element if building DOM if (buildDom && !domElementStack.isEmpty()) { domElementStack.pop(); @@ -763,7 +790,7 @@ public void endDocument() throws org.xml.sax.SAXException { public Object getStructuredResult() { return rootResult; } - + public Document getDomDocument() { return domDocument; } @@ -775,12 +802,12 @@ private String getElementName(String uri, String localName, String qName) { } else { elementName = qName; } - + // Apply toLower if enabled if (toLower) { elementName = elementName.toLowerCase(Locale.ROOT); } - + return elementName; } @@ -791,16 +818,16 @@ private String getAttributeName(String uri, String localName, String qName) { } else { attrName = qName; } - + // Apply toLower if enabled if (toLower) { attrName = attrName.toLowerCase(Locale.ROOT); } - + return attrName; } } - + /** * Creates a secure, pre-configured SAX parser factory for XML parsing. * This factory is configured to prevent XXE attacks with SAX-specific features. @@ -808,7 +835,7 @@ private String getAttributeName(String uri, String localName, String qName) { private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactory() { javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); factory.setValidating(false); - + // Configure SAX-specific security features to prevent XXE attacks try { // SAX parser features - these are the correct features for SAXParserFactory @@ -819,10 +846,10 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactory() } catch (Exception e) { // If features cannot be set, continue with default settings } - + return factory; } - + /** * Creates a secure, pre-configured namespace-aware SAX parser factory for XML parsing. * This factory is configured to prevent XXE attacks and has namespace awareness enabled. @@ -831,7 +858,7 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); factory.setValidating(false); factory.setNamespaceAware(true); - + // Configure SAX-specific security features to prevent XXE attacks try { // SAX parser features - these are the correct features for SAXParserFactory @@ -842,10 +869,10 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa } catch (Exception e) { // If features cannot be set, continue with default settings } - + return factory; } - + /** * Creates a secure, pre-configured SAX parser factory for strict XML parsing. * This factory is configured to prevent XXE attacks and has strict validation enabled. @@ -853,7 +880,7 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryStrict() { javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); factory.setValidating(false); - + // Configure SAX-specific security features to prevent XXE attacks try { // SAX parser features - these are the correct features for SAXParserFactory @@ -861,16 +888,16 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactorySt factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); - + // Enable strict parsing features factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); } catch (Exception e) { // If features cannot be set, continue with default settings } - + return factory; } - + /** * Creates a secure, pre-configured namespace-aware SAX parser factory for strict XML parsing. * This factory is configured to prevent XXE attacks, has namespace awareness enabled, and strict validation. @@ -879,7 +906,7 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance(); factory.setValidating(false); factory.setNamespaceAware(true); - + // Configure SAX-specific security features to prevent XXE attacks try { // SAX parser features - these are the correct features for SAXParserFactory @@ -887,16 +914,16 @@ private static javax.xml.parsers.SAXParserFactory createSecureSaxParserFactoryNa factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); - + // Enable strict parsing features factory.setFeature("http://apache.org/xml/features/validation/check-full-element-content", true); } catch (Exception e) { // If features cannot be set, continue with default settings } - + return factory; } - + /** * Creates a secure, pre-configured DocumentBuilderFactory for DOM creation. * Since we only use this factory to create empty DOM documents programmatically @@ -907,13 +934,13 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); // Enable for maximum compatibility factory.setValidating(false); - + // No XXE security features needed - we only create empty documents, // never parse XML with this factory - + return factory; } - + /** * Selects the appropriate pre-configured SAX parser factory based on processor configuration. * @@ -922,7 +949,7 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { * - Regular parsing, with namespaces: SAX_PARSER_FACTORY_NS * - Strict parsing, no namespaces: SAX_PARSER_FACTORY_STRICT * - Strict parsing, with namespaces: SAX_PARSER_FACTORY_NS_STRICT - * + * * @return the appropriate SAX parser factory for the current configuration */ private javax.xml.parsers.SAXParserFactory selectSaxParserFactory() { diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java index d0f4437074359..e693c39a5cab3 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorFactoryTests.java @@ -15,8 +15,8 @@ import java.util.HashMap; import java.util.Map; -import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; public class XmlProcessorFactoryTests extends ESTestCase { @@ -68,8 +68,8 @@ private Map createConfigWithNamespaces(String fieldName, Map createConfigWithXPathAndNamespaces( - String fieldName, - Map xpathExpressions, + String fieldName, + Map xpathExpressions, Map namespaces ) { Map config = createBaseConfig(fieldName); @@ -100,7 +100,7 @@ private Map createXPathConfig(String... expressionsAndFields) { if (expressionsAndFields.length % 2 != 0) { throw new IllegalArgumentException("Must provide even number of arguments (expression, field, expression, field, ...)"); } - + Map xpathConfig = new HashMap<>(); for (int i = 0; i < expressionsAndFields.length; i += 2) { xpathConfig.put(expressionsAndFields[i], expressionsAndFields[i + 1]); @@ -115,7 +115,7 @@ private Map createNamespaceConfig(String... prefixesAndUris) { if (prefixesAndUris.length % 2 != 0) { throw new IllegalArgumentException("Must provide even number of arguments (prefix, uri, prefix, uri, ...)"); } - + Map namespaceConfig = new HashMap<>(); for (int i = 0; i < prefixesAndUris.length; i += 2) { namespaceConfig.put(prefixesAndUris[i], prefixesAndUris[i + 1]); @@ -128,7 +128,7 @@ private Map createNamespaceConfig(String... prefixesAndUris) { */ private Map createConfigWithOptions(String fieldName, String... options) { Map config = createBaseConfig(fieldName); - + for (String option : options) { switch (option) { case "ignore_missing": @@ -162,7 +162,7 @@ private Map createConfigWithOptions(String fieldName, String... throw new IllegalArgumentException("Unknown option: " + option); } } - + return config; } @@ -172,11 +172,8 @@ private Map createConfigWithOptions(String fieldName, String... private void expectCreationFailure(Map config, Class exceptionClass, String expectedMessage) { XmlProcessor.Factory factory = createFactory(); String processorTag = randomAlphaOfLength(10); - - Exception exception = expectThrows( - exceptionClass, - () -> factory.create(null, processorTag, null, config, null) - ); + + Exception exception = expectThrows(exceptionClass, () -> factory.create(null, processorTag, null, config, null)); assertThat(exception.getMessage(), equalTo(expectedMessage)); } @@ -226,10 +223,7 @@ public void testCreateWithIgnoreEmptyValueOnly() throws Exception { } public void testCreateWithXPath() throws Exception { - Map xpathConfig = createXPathConfig( - "//author/text()", "author_field", - "//title/@lang", "language_field" - ); + Map xpathConfig = createXPathConfig("//author/text()", "author_field", "//title/@lang", "language_field"); Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); XmlProcessor processor = createProcessor(config); @@ -240,24 +234,30 @@ public void testCreateWithXPath() throws Exception { public void testCreateWithInvalidXPathConfig() throws Exception { Map config = createBaseConfig(); config.put("xpath", "invalid_string"); // Should be a map - + expectCreationFailure(config, IllegalArgumentException.class, "XPath configuration must be a map of expressions to target fields"); } public void testCreateWithInvalidXPathTargetField() throws Exception { Map config = createBaseConfig(); - + Map xpathConfig = new HashMap<>(); xpathConfig.put("//author/text()", 123); // Should be string config.put("xpath", xpathConfig); - expectCreationFailure(config, IllegalArgumentException.class, "XPath target field [//author/text()] must be a string, got [Integer]"); + expectCreationFailure( + config, + IllegalArgumentException.class, + "XPath target field [//author/text()] must be a string, got [Integer]" + ); } public void testCreateWithNamespaces() throws Exception { Map namespacesConfig = createNamespaceConfig( - "book", "http://example.com/book", - "author", "http://example.com/author" + "book", + "http://example.com/book", + "author", + "http://example.com/author" ); Map config = createConfigWithNamespaces(DEFAULT_FIELD, namespacesConfig); @@ -276,7 +276,7 @@ public void testCreateWithInvalidNamespacesConfig() throws Exception { public void testCreateWithInvalidNamespaceURI() throws Exception { Map config = createBaseConfig(); - + Map namespacesConfig = new HashMap<>(); namespacesConfig.put("book", 123); // Should be string config.put("namespaces", namespacesConfig); @@ -285,13 +285,8 @@ public void testCreateWithInvalidNamespaceURI() throws Exception { } public void testCreateWithXPathAndNamespaces() throws Exception { - Map xpathConfig = createXPathConfig( - "//book:author/text()", "author_field", - "//book:title/@lang", "language_field" - ); - Map namespacesConfig = createNamespaceConfig( - "book", "http://example.com/book" - ); + Map xpathConfig = createXPathConfig("//book:author/text()", "author_field", "//book:title/@lang", "language_field"); + Map namespacesConfig = createNamespaceConfig("book", "http://example.com/book"); Map config = createConfigWithXPathAndNamespaces(DEFAULT_FIELD, xpathConfig, namespacesConfig); XmlProcessor processor = createProcessor(config); @@ -301,7 +296,7 @@ public void testCreateWithXPathAndNamespaces() throws Exception { } // Tests for individual boolean options - + public void testCreateWithStoreXmlFalse() throws Exception { Map config = createConfigWithOptions(DEFAULT_FIELD, "store_xml"); XmlProcessor processor = createProcessor(config); @@ -344,8 +339,13 @@ public void testCreateWithStrictParseOptions() throws Exception { } public void testCreateWithMultipleOptions() throws Exception { - Map config = createConfigWithOptions(DEFAULT_FIELD, - "ignore_missing", "force_content", "force_array", "remove_namespaces"); + Map config = createConfigWithOptions( + DEFAULT_FIELD, + "ignore_missing", + "force_content", + "force_array", + "remove_namespaces" + ); XmlProcessor processor = createProcessor(config); assertThat(processor.getField(), equalTo(DEFAULT_FIELD)); @@ -356,28 +356,32 @@ public void testCreateWithMultipleOptions() throws Exception { } // Tests for invalid parse options - + public void testCreateWithInvalidParseOptions() throws Exception { Map config = createBaseConfig(); config.put("parse_options", "invalid_option"); - expectCreationFailure(config, IllegalArgumentException.class, "Invalid parse_options [invalid_option]. Only 'strict' is supported."); + expectCreationFailure( + config, + IllegalArgumentException.class, + "Invalid parse_options [invalid_option]. Only 'strict' is supported." + ); } // Tests for XPath compilation errors (testing precompilation feature) - + public void testCreateWithInvalidXPathExpression() throws Exception { Map xpathConfig = createXPathConfig("invalid xpath ][", "target_field"); Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); XmlProcessor.Factory factory = createFactory(); String processorTag = randomAlphaOfLength(10); - + IllegalArgumentException exception = expectThrows( IllegalArgumentException.class, () -> factory.create(null, processorTag, null, config, null) ); - + // Check that the error message contains the XPath expression and indicates it's invalid assertThat(exception.getMessage(), containsString("Invalid XPath expression [invalid xpath ][]:")); assertThat(exception.getMessage(), containsString("javax.xml.transform.TransformerException")); @@ -387,6 +391,10 @@ public void testCreateWithXPathUsingNamespacesWithoutConfiguration() throws Exce Map xpathConfig = createXPathConfig("//book:title/text()", "title_field"); Map config = createConfigWithXPath(DEFAULT_FIELD, xpathConfig); - expectCreationFailure(config, IllegalArgumentException.class, "Invalid XPath expression [//book:title/text()]: contains namespace prefixes but no namespace configuration provided"); + expectCreationFailure( + config, + IllegalArgumentException.class, + "Invalid XPath expression [//book:title/text()]: contains namespace prefixes but no namespace configuration provided" + ); } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index e48c6de593f84..f2c1aec1031d7 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -11,11 +11,12 @@ import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.test.ESTestCase; -import static org.hamcrest.Matchers.equalTo; import java.util.HashMap; -import java.util.Map; import java.util.List; +import java.util.Map; + +import static org.hamcrest.Matchers.equalTo; /** * Tests for {@link XmlProcessor}. These tests ensure feature parity and test coverage. @@ -29,7 +30,7 @@ public class XmlProcessorTests extends ESTestCase { private static IngestDocument createTestIngestDocument(String xml) { return new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of(XML_FIELD, xml))); } - + private static XmlProcessor createTestProcessor(Map config) { config.putIfAbsent("field", XML_FIELD); config.putIfAbsent("target_field", TARGET_FIELD); @@ -37,7 +38,7 @@ private static XmlProcessor createTestProcessor(Map config) { XmlProcessor.Factory factory = new XmlProcessor.Factory(); try { return factory.create(null, "_tag", null, config, null); - } catch (Exception e){ + } catch (Exception e) { fail("Failed to create XmlProcessor: " + e.getMessage()); return null; // This line will never be reached, but is needed to satisfy the compiler } @@ -54,55 +55,55 @@ public void testParseStandardXml() { IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); assertThat(foo.get("key"), equalTo("value")); } - + /** * Test parsing XML with array elements (multiple elements with same name). */ public void testParseXmlWithArrayValue() { String xml = "value1value2"; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); List keyValues = (List) foo.get("key"); assertThat(keyValues.size(), equalTo(2)); - + // The values might be nested inside their own lists Object firstValue = keyValues.get(0); assertThat(firstValue, equalTo("value1")); - + Object secondValue = keyValues.get(1); assertThat(secondValue, equalTo("value2")); } - + /** * Test parsing XML with nested elements. */ public void testParseXmlWithNestedElements() { String xml = "value"; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); Map key1Map = (Map) foo.get("key1"); assertThat(key1Map.size(), equalTo(1)); - + String key2Value = (String) key1Map.get("key2"); assertThat(key2Value, equalTo("value")); } @@ -112,13 +113,13 @@ public void testParseXmlWithNestedElements() { */ public void testParseXmlInSingleItemArray() { String xml = ""; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); assertThat(foo.get("bar"), equalTo("baz")); @@ -129,21 +130,21 @@ public void testParseXmlInSingleItemArray() { */ public void testXPathSingleElementExtraction() { String xml = "helloworld"; - + Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); - + Map config = new HashMap<>(); config.put("xpath", xpathMap); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + // Get the XPath result Object barContent = ingestDocument.getFieldValue("bar_content", Object.class); assertNotNull(barContent); assertEquals("hello", barContent); - + // Verify that the full parsed XML is also available Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); @@ -157,18 +158,18 @@ public void testXPathSingleElementExtraction() { */ public void testXPathMultipleElementsExtraction() { String xml = "firstsecondthird"; - + Map xpathMap = Map.of("/foo/bar", "all_bars"); - + Map config = new HashMap<>(); config.put("xpath", xpathMap); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + List allBars = ingestDocument.getFieldValue("all_bars", List.class); - + assertNotNull(allBars); assertThat(allBars.size(), equalTo(3)); assertThat(allBars.get(0), equalTo("first")); @@ -181,22 +182,22 @@ public void testXPathMultipleElementsExtraction() { */ public void testXPathAttributeExtraction() { String xml = "content"; - + Map xpathMap = new HashMap<>(); xpathMap.put("/foo/bar/@id", "bar_id"); xpathMap.put("/foo/bar/@type", "bar_type"); - + Map config = new HashMap<>(); config.put("xpath", xpathMap); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + String barId = ingestDocument.getFieldValue("bar_id", String.class); assertNotNull(barId); assertThat(barId, equalTo("123")); - + String barType = ingestDocument.getFieldValue("bar_type", String.class); assertNotNull(barType); assertThat(barType, equalTo("test")); @@ -206,15 +207,15 @@ public void testXPathAttributeExtraction() { * Test extracting elements with namespaces using XPath. */ public void testXPathNamespacedExtraction() { - String xml = "" + - "" + - " namespace-value" + - " regular-value" + - ""; - + String xml = "" + + "" + + " namespace-value" + + " regular-value" + + ""; + Map namespaces = Map.of("myns", "http://example.org/ns1"); Map xpathMap = Map.of("//myns:element/text()", "ns_value"); - + Map config = new HashMap<>(); config.put("xpath", xpathMap); config.put("namespaces", namespaces); @@ -222,7 +223,7 @@ public void testXPathNamespacedExtraction() { IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + String nsValue = ingestDocument.getFieldValue("ns_value", String.class); assertNotNull(nsValue); assertThat(nsValue, equalTo("namespace-value")); @@ -233,61 +234,61 @@ public void testXPathNamespacedExtraction() { */ public void testParseXmlWithMixedContent() { String xml = "This text is bold and this is italic!"; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); - + assertNotNull(foo.get("b")); - assertThat((String)foo.get("b"), equalTo("bold")); + assertThat((String) foo.get("b"), equalTo("bold")); assertNotNull(foo.get("i")); - assertThat((String)foo.get("i"), equalTo("italic")); + assertThat((String) foo.get("i"), equalTo("italic")); assertNotNull(foo.get("#text")); - assertThat((String)foo.get("#text"), equalTo("This text is and this is !")); + assertThat((String) foo.get("#text"), equalTo("This text is and this is !")); } - + /** * Test parsing XML with CDATA sections. */ public void testParseXmlWithCDATA() { String xml = " that shouldn't be parsed!]]>"; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Object content = data.get("foo"); - + assertNotNull(content); assertThat(content, equalTo("This is CDATA content with that shouldn't be parsed!")); } - + /** * Test parsing XML with numeric data. */ public void testParseXmlWithNumericData() { String xml = "12399.95true"; - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); - - assertThat((String)foo.get("count"), equalTo("123")); - assertThat((String)foo.get("price"), equalTo("99.95")); - assertThat((String)foo.get("active"), equalTo("true")); + + assertThat((String) foo.get("count"), equalTo("123")); + assertThat((String) foo.get("price"), equalTo("99.95")); + assertThat((String) foo.get("active"), equalTo("true")); } /** @@ -295,17 +296,17 @@ public void testParseXmlWithNumericData() { */ public void testParseXmlWithForceArray() { String xml = "single_value"; - + Map config = new HashMap<>(); config.put("force_array", true); // Enable force_array option XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map data = (Map) ingestDocument.getFieldValue(TARGET_FIELD, Object.class); Map foo = (Map) data.get("foo"); - + // With force_array=true, even single values should be in arrays Object barValue = foo.get("bar"); assertNotNull(barValue); @@ -321,205 +322,204 @@ public void testParseXmlWithForceArray() { * Tests that multiple XPath expressions can be used simultaneously. */ public void testMultipleXPathExpressions() { - String xml = "" + - " John30" + - " Jane25" + - ""; - + String xml = "" + + " John30" + + " Jane25" + + ""; + // Configure multiple XPath expressions Map xpathMap = new HashMap<>(); xpathMap.put("/root/person[1]/n/text()", "first_person_name"); xpathMap.put("/root/person[2]/n/text()", "second_person_name"); xpathMap.put("/root/person/@id", "person_ids"); - + Map config = new HashMap<>(); config.put("xpath", xpathMap); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + assertTrue("first_person_name field should exist", ingestDocument.hasField("first_person_name")); assertTrue("second_person_name field should exist", ingestDocument.hasField("second_person_name")); assertTrue("person_ids field should exist", ingestDocument.hasField("person_ids")); - + Object firstName = ingestDocument.getFieldValue("first_person_name", Object.class); assertEquals("John", firstName); - + Object secondName = ingestDocument.getFieldValue("second_person_name", Object.class); assertEquals("Jane", secondName); - + Object personIdsObj = ingestDocument.getFieldValue("person_ids", Object.class); assertTrue("person_ids should be a List", personIdsObj instanceof List); List personIds = (List) personIdsObj; assertEquals("Should have 2 person IDs", 2, personIds.size()); assertEquals("First person ID should be '1'", "1", personIds.get(0)); assertEquals("Second person ID should be '2'", "2", personIds.get(1)); - + assertTrue("Target field should exist", ingestDocument.hasField(TARGET_FIELD)); } - + /** * Test handling of invalid XML with ignoreFailure=false. */ public void testInvalidXml() { String xml = ""; // Invalid XML missing closing tag - + Map config = new HashMap<>(); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - - IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { - processor.execute(ingestDocument); - }); - - assertTrue("Error message should indicate XML is invalid", - exception.getMessage().contains("invalid XML") || - exception.getCause().getMessage().contains("XML")); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { processor.execute(ingestDocument); }); + + assertTrue( + "Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || exception.getCause().getMessage().contains("XML") + ); } - + /** * Test handling of invalid XML with ignoreFailure=true. */ public void testInvalidXmlWithIgnoreFailure() { String xml = ""; // Invalid XML missing closing tag - + Map config = new HashMap<>(); config.put("ignore_failure", true); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + List tags = ingestDocument.getFieldValue("tags", List.class); assertNotNull(tags); assertTrue(tags.contains("_xmlparsefailure")); } - + /** * Test the store_xml=false option to not store parsed XML in target field. */ public void testNoStoreXml() { String xml = "value"; - + // Set up XPath to extract value but don't store XML Map xpathMap = Map.of("/foo/bar/text()", "bar_content"); - + Map config = new HashMap<>(); config.put("store_xml", false); // Do not store XML in target field - config.put("xpath", xpathMap); + config.put("xpath", xpathMap); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + // Verify XPath result is stored String barContent = ingestDocument.getFieldValue("bar_content", String.class); assertNotNull(barContent); assertThat(barContent, equalTo("value")); - + // Verify the target field was not created assertFalse(ingestDocument.hasField(TARGET_FIELD)); } - + /** * Test the to_lower option for converting field names to lowercase. */ public void testToLower() { String xml = "value"; - + Map config = new HashMap<>(); config.put("to_lower", true); // Enable to_lower option XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + // Verify field names are lowercase Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); assertTrue(data.containsKey("foo")); assertFalse(data.containsKey("FOO")); - + Map foo = (Map) data.get("foo"); assertTrue(foo.containsKey("bar")); assertFalse(foo.containsKey("BAR")); assertThat(foo.get("bar"), equalTo("value")); } - + /** * Test the ignore_missing option when field is missing. */ public void testIgnoreMissing() { String xmlField = "nonexistent_field"; - + Map config = new HashMap<>(); config.put("field", xmlField); config.put("ignore_missing", true); // Enable ignore_missing option XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = new IngestDocument("_index", "_id", 1, null, null, new HashMap<>(Map.of())); processor.execute(ingestDocument); - - assertFalse("Target field should not be created when source field is missing", - ingestDocument.hasField(TARGET_FIELD)); - + + assertFalse("Target field should not be created when source field is missing", ingestDocument.hasField(TARGET_FIELD)); + // With ignoreMissing=false config.put("ignore_missing", false); XmlProcessor failingProcessor = createTestProcessor(config); - + // This should throw an exception - IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { - failingProcessor.execute(ingestDocument); - }); - + IllegalArgumentException exception = expectThrows( + IllegalArgumentException.class, + () -> { failingProcessor.execute(ingestDocument); } + ); + assertTrue(exception.getMessage().contains("not present as part of path")); } - + /** * Test that ignore_empty_value correctly filters out empty values from arrays and mixed content. */ public void testIgnoreEmptyValue() { // XML with mixed empty and non-empty elements, including array elements with mixed empty/non-empty values - String xml = "" + - " " + - " " + - " content" + - " nested-content" + - " " + - " first" + - " " + - " third" + - " " + - " fifth" + - " " + - " Text with and content" + - ""; + String xml = "" + + " " + + " " + + " content" + + " nested-content" + + " " + + " first" + + " " + + " third" + + " " + + " fifth" + + " " + + " Text with and content" + + ""; Map config = new HashMap<>(); config.put("ignore_empty_value", true); XmlProcessor processor = createTestProcessor(config); - + IngestDocument ingestDocument = createTestIngestDocument(xml); processor.execute(ingestDocument); - + Map result = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map root = (Map) result.get("root"); - + // Check empty elements are filtered assertFalse("Empty element should be filtered out", root.containsKey("empty")); assertFalse("Blank element should be filtered out", root.containsKey("blank")); - + // Check valid elements are preserved assertTrue("Valid element should be preserved", root.containsKey("valid")); assertEquals("content", root.get("valid")); - + // Check nested structure filtering Map nested = (Map) root.get("nested"); assertNotNull("Nested element should be preserved", nested); assertFalse("Empty nested element should be filtered", nested.containsKey("empty")); assertEquals("nested-content", nested.get("valid")); - + // Check array with mixed empty/non-empty values - Map items = (Map) root.get("items"); + Map items = (Map) root.get("items"); assertNotNull("Items object should be preserved", items); List itemList = (List) items.get("item"); assertNotNull("Item array should be preserved", itemList); @@ -527,7 +527,7 @@ public void testIgnoreEmptyValue() { assertEquals("first", itemList.get(0)); assertEquals("third", itemList.get(1)); assertEquals("fifth", itemList.get(2)); - + // Check mixed content handling Map mixed = (Map) root.get("mixed"); assertNotNull("Mixed content should be preserved", mixed); @@ -536,66 +536,65 @@ public void testIgnoreEmptyValue() { assertEquals("content", mixed.get("valid")); assertEquals("Text with and", mixed.get("#text")); } - + /** * Test parsing with strict mode option. */ public void testStrictParsing() { String xml = "valid"; - + Map config = new HashMap<>(); config.put("parse_options", "strict"); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map foo = (Map) data.get("foo"); assertThat(foo.get("bar"), equalTo("valid")); - + // Test with invalid XML in strict mode String invalidXml = ""; IngestDocument invalidDocument = createTestIngestDocument(invalidXml); - - IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { - processor.execute(invalidDocument); - }); - - assertTrue("Error message should indicate XML is invalid", - exception.getMessage().contains("invalid XML") || - exception.getCause().getMessage().contains("XML")); + + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> { processor.execute(invalidDocument); }); + + assertTrue( + "Error message should indicate XML is invalid", + exception.getMessage().contains("invalid XML") || exception.getCause().getMessage().contains("XML") + ); } - + /** * Test parsing XML with remove_namespaces option. */ public void testRemoveNamespaces() { String xml = "value"; - + Map config = new HashMap<>(); config.put("remove_namespaces", true); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map foo = (Map) data.get("foo"); - + assertTrue("Element with namespace should be present", foo.containsKey("ns:bar")); assertThat(foo.get("ns:bar"), equalTo("value")); - + // Now test with removeNamespaces=false IngestDocument ingestDocument2 = createTestIngestDocument(xml); - + config.put("remove_namespaces", false); XmlProcessor processor2 = createTestProcessor(config); processor2.execute(ingestDocument2); - + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); Map foo2 = (Map) data2.get("foo"); - + // With removeNamespaces=false, the "ns:" prefix should be preserved assertTrue("Element should be accessible with namespace prefix", foo2.containsKey("ns:bar")); assertThat(foo2.get("ns:bar"), equalTo("value")); @@ -606,30 +605,30 @@ public void testRemoveNamespaces() { */ public void testForceContent() { String xml = "simple text"; - + Map config = new HashMap<>(); config.put("force_content", true); XmlProcessor processor = createTestProcessor(config); IngestDocument ingestDocument = createTestIngestDocument(xml); - + processor.execute(ingestDocument); - + Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map foo = (Map) data.get("foo"); - + // With forceContent=true, the text should be in a #text field assertTrue("Text content should be in #text field", foo.containsKey("#text")); assertThat(foo.get("#text"), equalTo("simple text")); - + // Now test with forceContent=false config.put("force_content", false); XmlProcessor processor2 = createTestProcessor(config); IngestDocument ingestDocument2 = createTestIngestDocument(xml); - + processor2.execute(ingestDocument2); - + Map data2 = ingestDocument2.getFieldValue(TARGET_FIELD, Map.class); - + // With forceContent=false, the text should be directly assigned to the element assertThat(data2.get("foo"), equalTo("simple text")); } From 55ebcf96b07ebcdecb90e9ebe3552e72e6fdbafc Mon Sep 17 00:00:00 2001 From: Marc Guasch Date: Fri, 4 Jul 2025 16:22:58 +0200 Subject: [PATCH 8/9] fix: implement Copilot PR review suggestions - Fix test assertion for remove_namespaces feature - Use StandardCharsets.UTF_8 instead of string literal - Replace string reference comparison with isEmpty() - Move regex pattern to static final field for performance --- .../ingest/common/XmlProcessor.java | 20 +++++++++++-------- .../ingest/common/XmlProcessorTests.java | 4 ++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index 582b8d9f3e787..bfbbe05334714 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -15,12 +15,14 @@ import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.regex.Pattern; import javax.xml.namespace.NamespaceContext; import javax.xml.parsers.DocumentBuilder; @@ -52,6 +54,9 @@ public final class XmlProcessor extends AbstractProcessor { private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); + // Pre-compiled pattern to detect namespace prefixes + private static final Pattern NAMESPACE_PATTERN = Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); + // Pre-configured SAX parser factories for secure XML parsing private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); @@ -383,16 +388,14 @@ public Iterator getPrefixes(String namespaceURI) { }); } - // Pre-compiled pattern to detect namespace prefixes - java.util.regex.Pattern namespacePattern = - java.util.regex.Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); + // Use pre-compiled pattern to detect namespace prefixes for (Map.Entry entry : xpathExpressions.entrySet()) { String xpathExpression = entry.getKey(); String targetFieldName = entry.getValue(); // Validate namespace prefixes if no namespaces are configured - if (!hasNamespaces && namespacePattern.matcher(xpathExpression).matches()) { + if (!hasNamespaces && NAMESPACE_PATTERN.matcher(xpathExpression).matches()) { throw new IllegalArgumentException( "Invalid XPath expression [" + xpathExpression + "]: contains namespace prefixes but no namespace configuration provided" ); @@ -476,7 +479,7 @@ public XmlProcessor create( // Parse parse_options parameter String parseOptions = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "parse_options", ""); - if (parseOptions != null && parseOptions != "" && !"strict".equals(parseOptions)) { + if (parseOptions != null && !parseOptions.isEmpty() && !"strict".equals(parseOptions)) { throw new IllegalArgumentException("Invalid parse_options [" + parseOptions + "]. Only 'strict' is supported."); } @@ -528,7 +531,7 @@ public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.s // Use enhanced handler that can build DOM during streaming when needed XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); - parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes("UTF-8")), handler); + parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); // Store structured result if needed if (storeXml) { @@ -926,10 +929,11 @@ private static DocumentBuilderFactory createSecureDocumentBuilderFactory() { * @return the appropriate SAX parser factory for the current configuration */ private javax.xml.parsers.SAXParserFactory selectSaxParserFactory() { + boolean needsNamespaceAware = hasNamespaces() || removeNamespaces; if (isStrict()) { - return hasNamespaces() ? SAX_PARSER_FACTORY_NS_STRICT : SAX_PARSER_FACTORY_STRICT; + return needsNamespaceAware ? SAX_PARSER_FACTORY_NS_STRICT : SAX_PARSER_FACTORY_STRICT; } else { - return hasNamespaces() ? SAX_PARSER_FACTORY_NS : SAX_PARSER_FACTORY; + return needsNamespaceAware ? SAX_PARSER_FACTORY_NS : SAX_PARSER_FACTORY; } } } diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index e48c6de593f84..3df19ddf06269 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -583,8 +583,8 @@ public void testRemoveNamespaces() { Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map foo = (Map) data.get("foo"); - assertTrue("Element with namespace should be present", foo.containsKey("ns:bar")); - assertThat(foo.get("ns:bar"), equalTo("value")); + assertTrue("Element without namespace should be present", foo.containsKey("bar")); + assertThat(foo.get("bar"), equalTo("value")); // Now test with removeNamespaces=false IngestDocument ingestDocument2 = createTestIngestDocument(xml); From 6e531b2a1e13f8561b3463031bd3e71d105d7f0e Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 4 Jul 2025 14:35:48 +0000 Subject: [PATCH 9/9] [CI] Auto commit changes from spotless --- .../elasticsearch/ingest/common/XmlProcessor.java | 12 ++++++------ .../ingest/common/XmlProcessorTests.java | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java index c8b9e9a723412..3c2fac023fa20 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/XmlProcessor.java @@ -52,10 +52,10 @@ public final class XmlProcessor extends AbstractProcessor { public static final String TYPE = "xml"; private static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance(); - + // Pre-compiled pattern to detect namespace prefixes private static final Pattern NAMESPACE_PATTERN = Pattern.compile(".*\\b[a-zA-Z][a-zA-Z0-9_-]*:[a-zA-Z][a-zA-Z0-9_-]*.*"); - + // Pre-configured SAX parser factories for secure XML parsing private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY = createSecureSaxParserFactory(); private static final javax.xml.parsers.SAXParserFactory SAX_PARSER_FACTORY_NS = createSecureSaxParserFactoryNamespaceAware(); @@ -390,9 +390,9 @@ public Iterator getPrefixes(String namespaceURI) { } }); } - + // Use pre-compiled pattern to detect namespace prefixes - + for (Map.Entry entry : xpathExpressions.entrySet()) { String xpathExpression = entry.getKey(); String targetFieldName = entry.getValue(); @@ -557,9 +557,9 @@ public void fatalError(org.xml.sax.SAXParseException exception) throws org.xml.s // Use enhanced handler that can build DOM during streaming when needed XmlStreamingWithDomHandler handler = new XmlStreamingWithDomHandler(needsDom); - + parser.parse(new java.io.ByteArrayInputStream(xmlString.getBytes(StandardCharsets.UTF_8)), handler); - + // Store structured result if needed if (storeXml) { Object streamingResult = handler.getStructuredResult(); diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java index a02db6c43e59d..c01c086be1eb5 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/XmlProcessorTests.java @@ -581,10 +581,10 @@ public void testRemoveNamespaces() { Map data = ingestDocument.getFieldValue(TARGET_FIELD, Map.class); Map foo = (Map) data.get("foo"); - + assertTrue("Element without namespace should be present", foo.containsKey("bar")); assertThat(foo.get("bar"), equalTo("value")); - + // Now test with removeNamespaces=false IngestDocument ingestDocument2 = createTestIngestDocument(xml);