diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index 19bc69d..ef7231e 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -35,6 +35,7 @@ #include "iceberg/avro/avro_data_util_internal.h" #include "iceberg/avro/avro_schema_util_internal.h" #include "iceberg/avro/avro_stream_internal.h" +#include "iceberg/name_mapping.h" #include "iceberg/schema_internal.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/macros.h" @@ -96,16 +97,26 @@ class AvroReader::Impl { // Create a base reader without setting reader schema to enable projection. auto base_reader = std::make_unique<::avro::DataFileReaderBase>(std::move(input_stream)); - const ::avro::ValidSchema& file_schema = base_reader->dataSchema(); + ::avro::ValidSchema file_schema = base_reader->dataSchema(); // Validate field ids in the file schema. HasIdVisitor has_id_visitor; ICEBERG_RETURN_UNEXPECTED(has_id_visitor.Visit(file_schema)); + if (has_id_visitor.HasNoIds()) { - // TODO(gangwu): support applying field-ids based on name mapping - return NotImplemented("Avro file schema has no field IDs"); - } - if (!has_id_visitor.AllHaveIds()) { + // Apply field IDs based on name mapping if available + if (options.name_mapping) { + ICEBERG_ASSIGN_OR_RAISE( + auto new_root_node, + MakeAvroNodeWithFieldIds(file_schema.root(), *options.name_mapping)); + + // Update the file schema to use the new schema with field IDs + file_schema = ::avro::ValidSchema(new_root_node); + } else { + return InvalidSchema( + "Avro file schema has no field IDs and no name mapping provided"); + } + } else if (!has_id_visitor.AllHaveIds()) { return InvalidSchema("Not all fields in the Avro file schema have field IDs"); } diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 28e9e7b..e30bdbf 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -31,7 +31,9 @@ #include "iceberg/avro/avro_register.h" #include "iceberg/avro/avro_schema_util_internal.h" +#include "iceberg/avro/constants.h" #include "iceberg/metadata_columns.h" +#include "iceberg/name_mapping.h" #include "iceberg/schema.h" #include "iceberg/schema_util_internal.h" #include "iceberg/util/formatter.h" @@ -773,4 +775,278 @@ Result Project(const Schema& expected_schema, return SchemaProjection{std::move(field_projection.children)}; } +namespace { + +Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + auto new_record_node = std::make_shared<::avro::NodeRecord>(); + new_record_node->setName(original_node->name()); + + for (size_t i = 0; i < original_node->leaves(); ++i) { + if (i >= original_node->names()) { + return InvalidSchema("Index {} is out of bounds for names (size: {})", i, + original_node->names()); + } + const std::string& field_name = original_node->nameAt(i); + ::avro::NodePtr field_node = original_node->leafAt(i); + + // TODO(liuxiaoyu): Add support for case sensitivity in name matching. + // Try to find nested field by name + const MappedField* nested_field = nullptr; + if (field.nested_mapping) { + auto fields_span = field.nested_mapping->fields(); + for (const auto& f : fields_span) { + if (f.names.find(field_name) != f.names.end()) { + nested_field = &f; + break; + } + } + } + + if (nested_field) { + // Check if field_id is present + if (!nested_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for field '{}' in nested mapping", + field_name); + } + + // Preserve existing custom attributes for this field + ::avro::CustomAttributes attributes; + if (i < original_node->customAttributes()) { + // Copy all existing attributes from the original node + const auto& original_attrs = original_node->customAttributesAt(i); + const auto& existing_attrs = original_attrs.attributes(); + for (const auto& attr_pair : existing_attrs) { + // Copy each existing attribute to preserve original metadata + attributes.addAttribute(attr_pair.first, attr_pair.second, false); + } + } + + // Add field ID attribute to the new node (preserving existing attributes) + attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(nested_field->field_id.value()), false); + + new_record_node->addCustomAttributesForField(attributes); + + // Recursively apply field IDs to nested fields + ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node, + MakeAvroNodeWithFieldIds(field_node, *nested_field)); + new_record_node->addName(field_name); + new_record_node->addLeaf(new_nested_node); + } else { + // If no nested field found, this is an error + return InvalidSchema("Field '{}' not found in nested mapping", field_name); + } + } + + return new_record_node; +} + +Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + if (original_node->leaves() != 1) { + return InvalidSchema("Array type must have exactly one leaf"); + } + + auto new_array_node = std::make_shared<::avro::NodeArray>(); + + // Check if this is a map represented as array + if (HasMapLogicalType(original_node)) { + ICEBERG_ASSIGN_OR_RAISE(auto new_element_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), field)); + new_array_node->addLeaf(new_element_node); + return new_array_node; + } + + // For regular arrays, try to find element field ID from nested mapping + const MappedField* element_field = nullptr; + if (field.nested_mapping) { + auto fields_span = field.nested_mapping->fields(); + for (const auto& f : fields_span) { + if (f.names.find(std::string(kElement)) != f.names.end()) { + element_field = &f; + break; + } + } + } + + if (element_field) { + // Check if field_id is present + if (!element_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for element field in array"); + } + + ICEBERG_ASSIGN_OR_RAISE( + auto new_element_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), *element_field)); + new_array_node->addLeaf(new_element_node); + + // Add element field ID as custom attribute + ::avro::CustomAttributes element_attributes; + element_attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(*element_field->field_id), false); + new_array_node->addCustomAttributesForField(element_attributes); + } else { + // If no element field found, this is an error + return InvalidSchema("Element field not found in nested mapping for array"); + } + + return new_array_node; +} + +Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + if (original_node->leaves() != 2) { + return InvalidSchema("Map type must have exactly two leaves"); + } + + auto new_map_node = std::make_shared<::avro::NodeMap>(); + + // For map types, we need to extract key and value field mappings from the nested + // mapping + if (!field.nested_mapping) { + return InvalidSchema("Map type requires nested mapping for key and value fields"); + } + + // Find key and value field mappings by name + std::optional key_id = field.nested_mapping->Id("key"); + std::optional value_id = field.nested_mapping->Id("value"); + + if (!key_id || !value_id) { + return InvalidSchema("Map type requires both 'key' and 'value' field mappings"); + } + + std::optional key_field_ref = field.nested_mapping->Field(*key_id); + std::optional value_field_ref = + field.nested_mapping->Field(*value_id); + + if (!key_field_ref || !value_field_ref) { + return InvalidSchema("Map type requires both key and value field mappings"); + } + + const auto& key_mapped_field = key_field_ref->get(); + const auto& value_mapped_field = value_field_ref->get(); + + if (!key_mapped_field.field_id || !value_mapped_field.field_id) { + return InvalidSchema("Map key and value fields must have field IDs"); + } + + // Create key field with mapped field ID + MappedField key_field; + key_field.field_id = *key_mapped_field.field_id; + key_field.nested_mapping = key_mapped_field.nested_mapping; + + // Create value field with mapped field ID + MappedField value_field; + value_field.field_id = *value_mapped_field.field_id; + value_field.nested_mapping = value_mapped_field.nested_mapping; + + // Add key and value nodes + ICEBERG_ASSIGN_OR_RAISE(auto new_key_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), key_field)); + ICEBERG_ASSIGN_OR_RAISE( + auto new_value_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(1), value_field)); + new_map_node->addLeaf(new_key_node); + new_map_node->addLeaf(new_value_node); + + // Preserve existing custom attributes from the original node and add field ID + // attributes Copy existing attributes from the original node (if any) + if (original_node->customAttributes() > 0) { + const auto& original_attrs = original_node->customAttributesAt(0); + const auto& existing_attrs = original_attrs.attributes(); + for (const auto& attr_pair : existing_attrs) { + // Copy each existing attribute to preserve original metadata + ::avro::CustomAttributes attributes; + attributes.addAttribute(attr_pair.first, attr_pair.second, false); + new_map_node->addCustomAttributesForField(attributes); + } + } + + ::avro::CustomAttributes key_attributes; + key_attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(*key_mapped_field.field_id), false); + new_map_node->addCustomAttributesForField(key_attributes); + + ::avro::CustomAttributes value_attributes; + value_attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(*value_mapped_field.field_id), false); + new_map_node->addCustomAttributesForField(value_attributes); + + return new_map_node; +} + +Result<::avro::NodePtr> CreateUnionNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + if (original_node->leaves() != 2) { + return InvalidSchema("Union type must have exactly two branches"); + } + + const auto& branch_0 = original_node->leafAt(0); + const auto& branch_1 = original_node->leafAt(1); + + bool branch_0_is_null = (branch_0->type() == ::avro::AVRO_NULL); + bool branch_1_is_null = (branch_1->type() == ::avro::AVRO_NULL); + + if (branch_0_is_null && !branch_1_is_null) { + // branch_0 is null, branch_1 is not null + ICEBERG_ASSIGN_OR_RAISE(auto new_branch_1, MakeAvroNodeWithFieldIds(branch_1, field)); + auto new_union_node = std::make_shared<::avro::NodeUnion>(); + new_union_node->addLeaf(branch_0); // null branch + new_union_node->addLeaf(new_branch_1); + return new_union_node; + } else if (!branch_0_is_null && branch_1_is_null) { + // branch_0 is not null, branch_1 is null + ICEBERG_ASSIGN_OR_RAISE(auto new_branch_0, MakeAvroNodeWithFieldIds(branch_0, field)); + auto new_union_node = std::make_shared<::avro::NodeUnion>(); + new_union_node->addLeaf(new_branch_0); + new_union_node->addLeaf(branch_1); // null branch + return new_union_node; + } else if (branch_0_is_null && branch_1_is_null) { + // Both branches are null - this is invalid + return InvalidSchema("Union type cannot have two null branches"); + } else { + // Neither branch is null - this is invalid + return InvalidSchema("Union type must have exactly one null branch"); + } +} + +} // namespace + +Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& mapped_field) { + switch (original_node->type()) { + case ::avro::AVRO_RECORD: + return CreateRecordNodeWithFieldIds(original_node, mapped_field); + case ::avro::AVRO_ARRAY: + return CreateArrayNodeWithFieldIds(original_node, mapped_field); + case ::avro::AVRO_MAP: + return CreateMapNodeWithFieldIds(original_node, mapped_field); + case ::avro::AVRO_UNION: + return CreateUnionNodeWithFieldIds(original_node, mapped_field); + case ::avro::AVRO_BOOL: + case ::avro::AVRO_INT: + case ::avro::AVRO_LONG: + case ::avro::AVRO_FLOAT: + case ::avro::AVRO_DOUBLE: + case ::avro::AVRO_STRING: + case ::avro::AVRO_BYTES: + case ::avro::AVRO_FIXED: + // For primitive types, just return a copy + return original_node; + case ::avro::AVRO_NULL: + case ::avro::AVRO_ENUM: + default: + return InvalidSchema("Unsupported Avro type for field ID application: {}", + ToString(original_node)); + } +} + +Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& mapping) { + MappedField mapped_field; + mapped_field.nested_mapping = std::make_shared(mapping.AsMappedFields()); + return MakeAvroNodeWithFieldIds(original_node, mapped_field); +} + } // namespace iceberg::avro diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index de3922a..ac00834 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -23,6 +23,7 @@ #include +#include "iceberg/name_mapping.h" #include "iceberg/result.h" #include "iceberg/schema_util.h" #include "iceberg/type.h" @@ -148,4 +149,18 @@ std::string ToString(const ::avro::LogicalType::Type& logical_type); /// \return True if the node has a map logical type, false otherwise. bool HasMapLogicalType(const ::avro::NodePtr& node); +/// \brief Create a new Avro node with field IDs from name mapping. +/// \param original_node The original Avro node to copy. +/// \param mapped_field The mapped field to apply field IDs from. +/// \return A new Avro node with field IDs applied, or an error. +Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& mapped_field); + +/// \brief Create a new Avro node with field IDs from name mapping. +/// \param original_node The original Avro node to copy. +/// \param mapping The name mapping to apply field IDs from. +/// \return A new Avro node with field IDs applied, or an error. +Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& mapping); + } // namespace iceberg::avro diff --git a/src/iceberg/avro/constants.h b/src/iceberg/avro/constants.h new file mode 100644 index 0000000..b56cc66 --- /dev/null +++ b/src/iceberg/avro/constants.h @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +namespace iceberg::avro { + +// Avro logical type constants +constexpr std::string_view kMapLogicalType = "map"; + +// Name mapping field constants +constexpr std::string_view kElement = "element"; +constexpr std::string_view kKey = "key"; +constexpr std::string_view kValue = "value"; + +} // namespace iceberg::avro diff --git a/src/iceberg/file_reader.h b/src/iceberg/file_reader.h index 6fc7522..8a59e33 100644 --- a/src/iceberg/file_reader.h +++ b/src/iceberg/file_reader.h @@ -86,6 +86,9 @@ struct ICEBERG_EXPORT ReaderOptions { /// \brief The filter to apply to the data. Reader implementations may ignore this if /// the file format does not support filtering. std::shared_ptr filter; + /// \brief Name mapping for schema evolution compatibility. Used when reading files + /// that may have different field names than the current schema. + std::shared_ptr name_mapping; /// \brief Format-specific or implementation-specific properties. std::unordered_map properties; }; diff --git a/src/iceberg/name_mapping.cc b/src/iceberg/name_mapping.cc index ef9a0bb..eaf6199 100644 --- a/src/iceberg/name_mapping.cc +++ b/src/iceberg/name_mapping.cc @@ -152,7 +152,7 @@ const std::unordered_map& MappedFields::LazyIdToFi NameMapping::NameMapping(std::unique_ptr mapping) : mapping_(std::move(mapping)) {} -std::optional NameMapping::Find(int32_t id) { +std::optional NameMapping::Find(int32_t id) const { const auto& fields_by_id = LazyFieldsById(); if (auto iter = fields_by_id.find(id); iter != fields_by_id.cend()) { return iter->second; @@ -160,14 +160,15 @@ std::optional NameMapping::Find(int32_t id) { return std::nullopt; } -std::optional NameMapping::Find(std::span names) { +std::optional NameMapping::Find( + std::span names) const { if (names.empty()) { return std::nullopt; } return Find(JoinByDot(names)); } -std::optional NameMapping::Find(const std::string& name) { +std::optional NameMapping::Find(const std::string& name) const { const auto& fields_by_name = LazyFieldsByName(); if (auto iter = fields_by_name.find(name); iter != fields_by_name.cend()) { return iter->second; diff --git a/src/iceberg/name_mapping.h b/src/iceberg/name_mapping.h index 1b7b6f3..3a66be0 100644 --- a/src/iceberg/name_mapping.h +++ b/src/iceberg/name_mapping.h @@ -104,13 +104,13 @@ class ICEBERG_EXPORT NameMapping { static std::unique_ptr MakeEmpty(); /// \brief Find a field by its ID. - std::optional Find(int32_t id); + std::optional Find(int32_t id) const; /// \brief Find a field by its unconcatenated names. - std::optional Find(std::span names); + std::optional Find(std::span names) const; /// \brief Find a field by its (concatenated) name. - std::optional Find(const std::string& name); + std::optional Find(const std::string& name) const; /// \brief Get the underlying MappedFields instance. const MappedFields& AsMappedFields() const; diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc index 875bfe9..1e971ec 100644 --- a/test/avro_schema_test.cc +++ b/test/avro_schema_test.cc @@ -25,7 +25,9 @@ #include #include "iceberg/avro/avro_schema_util_internal.h" +#include "iceberg/json_internal.h" #include "iceberg/metadata_columns.h" +#include "iceberg/name_mapping.h" #include "iceberg/schema.h" #include "matchers.h" @@ -1057,4 +1059,454 @@ TEST(AvroSchemaProjectionTest, ProjectDecimalIncompatible) { ASSERT_THAT(projection_result, HasErrorMessage("Cannot read")); } +// NameMapping tests for Avro schema context +class NameMappingAvroSchemaTest : public ::testing::Test { + protected: + // Helper function to create a simple name mapping + std::unique_ptr CreateSimpleNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); + fields.emplace_back(MappedField{.names = {"age"}, .field_id = 3}); + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a nested name mapping + std::unique_ptr CreateNestedNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for address + std::vector address_fields; + address_fields.emplace_back(MappedField{.names = {"street"}, .field_id = 10}); + address_fields.emplace_back(MappedField{.names = {"city"}, .field_id = 11}); + address_fields.emplace_back(MappedField{.names = {"zip"}, .field_id = 12}); + auto address_mapping = MappedFields::Make(std::move(address_fields)); + + fields.emplace_back(MappedField{.names = {"address"}, + .field_id = 2, + .nested_mapping = std::move(address_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for array types + std::unique_ptr CreateArrayNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for array element + std::vector element_fields; + element_fields.emplace_back(MappedField{.names = {"element"}, .field_id = 20}); + auto element_mapping = MappedFields::Make(std::move(element_fields)); + + fields.emplace_back(MappedField{ + .names = {"items"}, .field_id = 2, .nested_mapping = std::move(element_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for map types + std::unique_ptr CreateMapNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for map key-value + std::vector map_fields; + map_fields.emplace_back(MappedField{.names = {"key"}, .field_id = 30}); + map_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 31}); + auto map_mapping = MappedFields::Make(std::move(map_fields)); + + fields.emplace_back(MappedField{.names = {"properties"}, + .field_id = 2, + .nested_mapping = std::move(map_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for union types + std::unique_ptr CreateUnionNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"data"}, .field_id = 2}); + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for complex map types + // (array>) + std::unique_ptr CreateComplexMapNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for array element (struct) + std::vector element_fields; + element_fields.emplace_back(MappedField{.names = {"key"}, .field_id = 40}); + element_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 41}); + auto element_mapping = MappedFields::Make(std::move(element_fields)); + + // Nested mapping for array + std::vector array_fields; + array_fields.emplace_back(MappedField{.names = {"element"}, + .field_id = 50, + .nested_mapping = std::move(element_mapping)}); + auto array_mapping = MappedFields::Make(std::move(array_fields)); + + fields.emplace_back(MappedField{ + .names = {"entries"}, .field_id = 2, .nested_mapping = std::move(array_mapping)}); + + return NameMapping::Make(std::move(fields)); + } +}; + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToRecord) { + // Create a simple Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateSimpleNameMapping(); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& node = *result; + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(node->names(), 3); + EXPECT_EQ(node->leaves(), 3); + + // Check that field IDs are properly applied + ASSERT_EQ(node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToNestedRecord) { + // Create a nested Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "address", "type": { + "type": "record", + "name": "address", + "fields": [ + {"name": "street", "type": "string"}, + {"name": "city", "type": "string"}, + {"name": "zip", "type": "string"} + ] + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateNestedNameMapping(); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& node = *result; + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(node->names(), 2); + EXPECT_EQ(node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + + // Check nested record + const auto& address_node = node->leafAt(1); + EXPECT_EQ(address_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(address_node->names(), 3); + EXPECT_EQ(address_node->leaves(), 3); + + // Check that field IDs are properly applied to nested fields + ASSERT_EQ(address_node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/0, /*field_id=*/10)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/1, /*field_id=*/11)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/2, /*field_id=*/12)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { + // Create an Avro array schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "items", "type": { + "type": "array", + "items": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateArrayNameMapping(); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check array field structure and element field ID + const auto& array_node = new_node->leafAt(1); + EXPECT_EQ(array_node->type(), ::avro::AVRO_ARRAY); + EXPECT_EQ(array_node->leaves(), 1); + + // Check that array element has field ID applied + const auto& element_node = array_node->leafAt(0); + EXPECT_EQ(element_node->type(), ::avro::AVRO_STRING); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(array_node, /*index=*/0, /*field_id=*/20)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { + // Create an Avro map schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "properties", "type": { + "type": "map", + "values": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateMapNameMapping(); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check map field structure and key-value field IDs + const auto& map_node = new_node->leafAt(1); + EXPECT_EQ(map_node->type(), ::avro::AVRO_MAP); + ASSERT_GE(map_node->leaves(), 2); + EXPECT_EQ(map_node->leafAt(0)->type(), ::avro::AVRO_STRING); + EXPECT_EQ(map_node->leafAt(1)->type(), ::avro::AVRO_STRING); + ASSERT_EQ(map_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(map_node, /*index=*/0, /*field_id=*/30)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(map_node, /*index=*/1, /*field_id=*/31)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToComplexMap) { + // Create an Avro schema for complex map (array>) without field IDs + // This represents a map where key is not a string type + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "entries", "type": { + "type": "array", + "items": { + "type": "record", + "name": "entry", + "fields": [ + {"name": "key", "type": "int"}, + {"name": "value", "type": "string"} + ] + } + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateComplexMapNameMapping(); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check array field structure (representing the map) + const auto& array_node = new_node->leafAt(1); + EXPECT_EQ(array_node->type(), ::avro::AVRO_ARRAY); + EXPECT_EQ(array_node->leaves(), 1); + + // Check array element (struct) + const auto& element_node = array_node->leafAt(0); + EXPECT_EQ(element_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(element_node->names(), 2); + EXPECT_EQ(element_node->leaves(), 2); + + // Check that field IDs are properly applied to struct fields + ASSERT_EQ(element_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(element_node, /*index=*/0, /*field_id=*/40)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(element_node, /*index=*/1, /*field_id=*/41)); + + // Check key and value types + EXPECT_EQ(element_node->leafAt(0)->type(), ::avro::AVRO_INT); + EXPECT_EQ(element_node->leafAt(1)->type(), ::avro::AVRO_STRING); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToUnion) { + // Create an Avro union schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "data", "type": ["null", "string"]} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateUnionNameMapping(); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check union field + const auto& union_node = new_node->leafAt(1); + EXPECT_EQ(union_node->type(), ::avro::AVRO_UNION); + EXPECT_EQ(union_node->leaves(), 2); + + // Check that the non-null branch has field ID applied + const auto& non_null_branch = union_node->leafAt(1); + EXPECT_EQ(non_null_branch->type(), ::avro::AVRO_STRING); +} + +TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) { + // Create a name mapping with missing field ID + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}}); // Missing field_id + auto name_mapping = NameMapping::Make(std::move(fields)); + + // Create a simple Avro record schema + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, + HasErrorMessage("Field ID is missing for field 'name' in nested mapping")); +} + +TEST_F(NameMappingAvroSchemaTest, MissingFieldError) { + // Create a name mapping + auto name_mapping = CreateSimpleNameMapping(); + + // Create an Avro record schema with a field not in the mapping + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"}, + {"name": "unknown_field", "type": "int"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, + HasErrorMessage("Field 'unknown_field' not found in nested mapping")); +} + +TEST_F(NameMappingAvroSchemaTest, MissingArrayElementError) { + // Create a name mapping without array element mapping + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + auto name_mapping = NameMapping::Make(std::move(fields)); + + // Create an Avro array schema + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "items", "type": { + "type": "array", + "items": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, HasErrorMessage("Field 'items' not found in nested mapping")); +} + +TEST_F(NameMappingAvroSchemaTest, MissingMapKeyValueError) { + // Create a name mapping without map key/value mapping + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + auto name_mapping = NameMapping::Make(std::move(fields)); + + // Create an Avro map schema + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "properties", "type": { + "type": "map", + "values": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, HasErrorMessage("Field 'properties' not found in nested mapping")); +} } // namespace iceberg::avro diff --git a/test/name_mapping_test.cc b/test/name_mapping_test.cc index 53c9fe0..70f7156 100644 --- a/test/name_mapping_test.cc +++ b/test/name_mapping_test.cc @@ -137,6 +137,58 @@ TEST_F(NameMappingTest, FindByNameParts) { } } +TEST_F(NameMappingTest, FindMethodsOnConstObject) { + auto mapping = MakeNameMapping(); + const NameMapping& const_mapping = *mapping; + + // Test Find by ID on const object + auto field_by_id = const_mapping.Find(1); + ASSERT_TRUE(field_by_id.has_value()); + EXPECT_EQ(field_by_id->get().field_id, 1); + EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("foo", "bar")); + + // Test Find by name on const object + auto field_by_name = const_mapping.Find("baz"); + ASSERT_TRUE(field_by_name.has_value()); + EXPECT_EQ(field_by_name->get().field_id, 2); + EXPECT_THAT(field_by_name->get().names, testing::UnorderedElementsAre("baz")); + + // Test Find by name parts on const object + auto field_by_parts = const_mapping.Find(std::vector{"qux", "hello"}); + ASSERT_TRUE(field_by_parts.has_value()); + EXPECT_EQ(field_by_parts->get().field_id, 4); + EXPECT_THAT(field_by_parts->get().names, testing::UnorderedElementsAre("hello")); + + // Test Find non-existent field on const object + auto non_existent = const_mapping.Find(999); + EXPECT_FALSE(non_existent.has_value()); + + auto non_existent_name = const_mapping.Find("non_existent"); + EXPECT_FALSE(non_existent_name.has_value()); + + auto non_existent_parts = + const_mapping.Find(std::vector{"non", "existent"}); + EXPECT_FALSE(non_existent_parts.has_value()); +} + +TEST_F(NameMappingTest, FindMethodsOnConstEmptyMapping) { + auto empty_mapping = NameMapping::MakeEmpty(); + const NameMapping& const_empty_mapping = *empty_mapping; + + // Test Find by ID on const empty mapping + auto field_by_id = const_empty_mapping.Find(1); + EXPECT_FALSE(field_by_id.has_value()); + + // Test Find by name on const empty mapping + auto field_by_name = const_empty_mapping.Find("test"); + EXPECT_FALSE(field_by_name.has_value()); + + // Test Find by name parts on const empty mapping + auto field_by_parts = + const_empty_mapping.Find(std::vector{"test", "field"}); + EXPECT_FALSE(field_by_parts.has_value()); +} + TEST_F(NameMappingTest, Equality) { auto mapping1 = MakeNameMapping(); auto mapping2 = MakeNameMapping();