From 97bc93541eb2488e59d9d705a29dafe0d937ba11 Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Sun, 27 Jul 2025 09:02:17 +0800 Subject: [PATCH 01/15] support applying field-ids based on name mapping --- src/iceberg/avro/avro_reader.cc | 163 ++++++++++++++++++++++++++++- src/iceberg/name_mapping.cc | 7 +- src/iceberg/name_mapping.h | 6 +- test/avro_reader_test.cc | 176 ++++++++++++++++++++++++++++++++ test/name_mapping_test.cc | 52 ++++++++++ 5 files changed, 394 insertions(+), 10 deletions(-) create mode 100644 test/avro_reader_test.cc diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index 19bc69df..a4b3e2c9 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -30,11 +30,14 @@ #include #include #include +#include #include "iceberg/arrow/arrow_fs_file_io_internal.h" #include "iceberg/avro/avro_data_util_internal.h" #include "iceberg/avro/avro_schema_util_internal.h" #include "iceberg/avro/avro_stream_internal.h" +#include "iceberg/json_internal.h" +#include "iceberg/name_mapping.h" #include "iceberg/schema_internal.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/macros.h" @@ -101,11 +104,22 @@ class AvroReader::Impl { // Validate field ids in the file schema. HasIdVisitor has_id_visitor; ICEBERG_RETURN_UNEXPECTED(has_id_visitor.Visit(file_schema)); + if (has_id_visitor.HasNoIds()) { - // TODO(gangwu): support applying field-ids based on name mapping - return NotImplemented("Avro file schema has no field IDs"); - } - if (!has_id_visitor.AllHaveIds()) { + // Apply field IDs based on name mapping if available + auto name_mapping_iter = options.properties.find("name_mapping"); + if (name_mapping_iter != options.properties.end()) { + // Parse name mapping from JSON string + ICEBERG_ASSIGN_OR_RAISE(auto name_mapping, + iceberg::NameMappingFromJson( + nlohmann::json::parse(name_mapping_iter->second))); + ICEBERG_RETURN_UNEXPECTED( + ApplyFieldIdsFromNameMapping(file_schema.root(), *name_mapping)); + } else { + return NotImplemented( + "Avro file schema has no field IDs and no name mapping provided"); + } + } else if (!has_id_visitor.AllHaveIds()) { return InvalidSchema("Not all fields in the Avro file schema have field IDs"); } @@ -212,6 +226,147 @@ class AvroReader::Impl { return arrow_array; } + // Apply field IDs to Avro schema nodes based on name mapping + Status ApplyFieldIdsFromNameMapping(const ::avro::NodePtr& node, + const NameMapping& name_mapping) { + switch (node->type()) { + case ::avro::AVRO_RECORD: + return ApplyFieldIdsToRecord(node, name_mapping); + case ::avro::AVRO_ARRAY: + return ApplyFieldIdsToArray(node, name_mapping); + case ::avro::AVRO_MAP: + return ApplyFieldIdsToMap(node, name_mapping); + case ::avro::AVRO_UNION: + return ApplyFieldIdsToUnion(node, name_mapping); + case ::avro::AVRO_BOOL: + case ::avro::AVRO_INT: + case ::avro::AVRO_LONG: + case ::avro::AVRO_FLOAT: + case ::avro::AVRO_DOUBLE: + case ::avro::AVRO_STRING: + case ::avro::AVRO_BYTES: + case ::avro::AVRO_FIXED: + return {}; + case ::avro::AVRO_NULL: + case ::avro::AVRO_ENUM: + default: + return InvalidSchema("Unsupported Avro type for field ID application: {}", + static_cast(node->type())); + } + } + + Status ApplyFieldIdsToRecord(const ::avro::NodePtr& node, + const NameMapping& name_mapping) { + for (size_t i = 0; i < node->leaves(); ++i) { + const std::string& field_name = node->nameAt(i); + ::avro::NodePtr field_node = node->leafAt(i); + + // Try to find field ID by name in the name mapping + if (auto field_ref = name_mapping.Find(field_name)) { + if (field_ref->get().field_id.has_value()) { + // Add field ID attribute to the node + ::avro::CustomAttributes attributes; + attributes.addAttribute( + "field-id", std::to_string(field_ref->get().field_id.value()), false); + node->addCustomAttributesForField(attributes); + } + + // Recursively apply field IDs to nested fields if they exist + if (field_ref->get().nested_mapping && + field_node->type() == ::avro::AVRO_RECORD) { + const auto& nested_mapping = field_ref->get().nested_mapping; + auto fields_span = nested_mapping->fields(); + std::vector fields_vector(fields_span.begin(), fields_span.end()); + auto nested_name_mapping = NameMapping::Make(std::move(fields_vector)); + ICEBERG_RETURN_UNEXPECTED( + ApplyFieldIdsFromNameMapping(field_node, *nested_name_mapping)); + } else { + // Recursively apply field IDs to child nodes (only if not already handled by + // nested mapping) + ICEBERG_RETURN_UNEXPECTED( + ApplyFieldIdsFromNameMapping(field_node, name_mapping)); + } + } else { + // Recursively apply field IDs to child nodes even if no mapping found + ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(field_node, name_mapping)); + } + } + return {}; + } + + Status ApplyFieldIdsToArray(const ::avro::NodePtr& node, + const NameMapping& name_mapping) { + if (node->leaves() != 1) { + return InvalidSchema("Array type must have exactly one leaf"); + } + + // Check if this is a map represented as array + if (node->logicalType().type() == ::avro::LogicalType::CUSTOM && + node->logicalType().customLogicalType() != nullptr && + node->logicalType().customLogicalType()->name() == "map") { + return ApplyFieldIdsFromNameMapping(node->leafAt(0), name_mapping); + } + + // For regular arrays, try to find element field ID + if (auto element_field = name_mapping.Find("element")) { + if (element_field->get().field_id.has_value()) { + ::avro::CustomAttributes attributes; + attributes.addAttribute( + "element-id", std::to_string(element_field->get().field_id.value()), false); + node->addCustomAttributesForField(attributes); + } + } + + return ApplyFieldIdsFromNameMapping(node->leafAt(0), name_mapping); + } + + Status ApplyFieldIdsToMap(const ::avro::NodePtr& node, + const NameMapping& name_mapping) { + if (node->leaves() != 2) { + return InvalidSchema("Map type must have exactly two leaves"); + } + + // Try to find key and value field IDs + if (auto key_field = name_mapping.Find("key")) { + if (key_field->get().field_id.has_value()) { + ::avro::CustomAttributes attributes; + attributes.addAttribute("key-id", + std::to_string(key_field->get().field_id.value()), false); + node->addCustomAttributesForField(attributes); + } + } + + if (auto value_field = name_mapping.Find("value")) { + if (value_field->get().field_id.has_value()) { + ::avro::CustomAttributes attributes; + attributes.addAttribute( + "value-id", std::to_string(value_field->get().field_id.value()), false); + node->addCustomAttributesForField(attributes); + } + } + + return ApplyFieldIdsFromNameMapping(node->leafAt(1), name_mapping); + } + + Status ApplyFieldIdsToUnion(const ::avro::NodePtr& node, + const NameMapping& name_mapping) { + if (node->leaves() != 2) { + return InvalidSchema("Union type must have exactly two branches"); + } + + const auto& branch_0 = node->leafAt(0); + const auto& branch_1 = node->leafAt(1); + + if (branch_0->type() == ::avro::AVRO_NULL) { + return ApplyFieldIdsFromNameMapping(branch_1, name_mapping); + } + if (branch_1->type() == ::avro::AVRO_NULL) { + return ApplyFieldIdsFromNameMapping(branch_0, name_mapping); + } + + return InvalidSchema("Union type must have exactly one null branch"); + } + private: // Max number of rows in the record batch to read. int64_t batch_size_{}; diff --git a/src/iceberg/name_mapping.cc b/src/iceberg/name_mapping.cc index ef9a0bb0..eaf6199e 100644 --- a/src/iceberg/name_mapping.cc +++ b/src/iceberg/name_mapping.cc @@ -152,7 +152,7 @@ const std::unordered_map& MappedFields::LazyIdToFi NameMapping::NameMapping(std::unique_ptr mapping) : mapping_(std::move(mapping)) {} -std::optional NameMapping::Find(int32_t id) { +std::optional NameMapping::Find(int32_t id) const { const auto& fields_by_id = LazyFieldsById(); if (auto iter = fields_by_id.find(id); iter != fields_by_id.cend()) { return iter->second; @@ -160,14 +160,15 @@ std::optional NameMapping::Find(int32_t id) { return std::nullopt; } -std::optional NameMapping::Find(std::span names) { +std::optional NameMapping::Find( + std::span names) const { if (names.empty()) { return std::nullopt; } return Find(JoinByDot(names)); } -std::optional NameMapping::Find(const std::string& name) { +std::optional NameMapping::Find(const std::string& name) const { const auto& fields_by_name = LazyFieldsByName(); if (auto iter = fields_by_name.find(name); iter != fields_by_name.cend()) { return iter->second; diff --git a/src/iceberg/name_mapping.h b/src/iceberg/name_mapping.h index 1b7b6f3c..3a66be07 100644 --- a/src/iceberg/name_mapping.h +++ b/src/iceberg/name_mapping.h @@ -104,13 +104,13 @@ class ICEBERG_EXPORT NameMapping { static std::unique_ptr MakeEmpty(); /// \brief Find a field by its ID. - std::optional Find(int32_t id); + std::optional Find(int32_t id) const; /// \brief Find a field by its unconcatenated names. - std::optional Find(std::span names); + std::optional Find(std::span names) const; /// \brief Find a field by its (concatenated) name. - std::optional Find(const std::string& name); + std::optional Find(const std::string& name) const; /// \brief Get the underlying MappedFields instance. const MappedFields& AsMappedFields() const; diff --git a/test/avro_reader_test.cc b/test/avro_reader_test.cc new file mode 100644 index 00000000..4396e58d --- /dev/null +++ b/test/avro_reader_test.cc @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/avro/avro_reader.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iceberg/json_internal.h" +#include "iceberg/name_mapping.h" +#include "iceberg/result.h" +#include "iceberg/schema.h" +#include "matchers.h" + +namespace iceberg::avro { + +class AvroReaderTest : public ::testing::Test { + protected: + void SetUp() override { + // Create a simple Avro schema for testing + std::string schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "long"}, + {"name": "name", "type": "string"}, + {"name": "data", "type": { + "type": "record", + "name": "nested_record", + "fields": [ + {"name": "value", "type": "int"}, + {"name": "description", "type": "string"} + ] + }} + ] + })"; + + avro_schema_ = ::avro::compileJsonSchemaFromString(schema_json); + } + + std::unique_ptr CreateTestNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); + + // Create nested mapping for the data field + std::vector nested_fields; + nested_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 3}); + nested_fields.emplace_back(MappedField{.names = {"description"}, .field_id = 4}); + auto nested_mapping = MappedFields::Make(std::move(nested_fields)); + + fields.emplace_back(MappedField{ + .names = {"data"}, .field_id = 5, .nested_mapping = std::move(nested_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + ::avro::ValidSchema avro_schema_; +}; + +TEST_F(AvroReaderTest, ApplyFieldIdsFromNameMappingBasic) { + auto name_mapping = CreateTestNameMapping(); + + // Test that the schema can be processed without errors + // Note: This is a basic test to ensure the function can be called + // In a real implementation, we would need to mock the Avro Node interface + // or create actual Avro schema nodes to test the field ID application + + EXPECT_TRUE(name_mapping != nullptr); + EXPECT_EQ(name_mapping->AsMappedFields().Size(), 3); +} + +TEST_F(AvroReaderTest, NameMappingFindMethodsWorkWithConst) { + auto name_mapping = CreateTestNameMapping(); + const NameMapping& const_mapping = *name_mapping; + + // Test that Find methods work on const objects + auto field_by_id = const_mapping.Find(1); + ASSERT_TRUE(field_by_id.has_value()); + EXPECT_EQ(field_by_id->get().field_id, 1); + EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); + + auto field_by_name = const_mapping.Find("name"); + ASSERT_TRUE(field_by_name.has_value()); + EXPECT_EQ(field_by_name->get().field_id, 2); + EXPECT_THAT(field_by_name->get().names, testing::UnorderedElementsAre("name")); + + auto field_by_parts = const_mapping.Find(std::vector{"data", "value"}); + ASSERT_TRUE(field_by_parts.has_value()); + EXPECT_EQ(field_by_parts->get().field_id, 3); + EXPECT_THAT(field_by_parts->get().names, testing::UnorderedElementsAre("value")); +} + +TEST_F(AvroReaderTest, NameMappingFromJsonWorks) { + std::string json_str = R"([ + {"field-id": 1, "names": ["id"]}, + {"field-id": 2, "names": ["name"]}, + {"field-id": 3, "names": ["data"], "fields": [ + {"field-id": 4, "names": ["value"]}, + {"field-id": 5, "names": ["description"]} + ]} + ])"; + + auto result = iceberg::NameMappingFromJson(nlohmann::json::parse(json_str)); + ASSERT_TRUE(result.has_value()); + + auto mapping = std::move(result.value()); + const NameMapping& const_mapping = *mapping; + + // Test that the parsed mapping works correctly + auto field_by_id = const_mapping.Find(1); + ASSERT_TRUE(field_by_id.has_value()); + EXPECT_EQ(field_by_id->get().field_id, 1); + EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); + + auto field_by_name = const_mapping.Find("name"); + ASSERT_TRUE(field_by_name.has_value()); + EXPECT_EQ(field_by_name->get().field_id, 2); + EXPECT_THAT(field_by_name->get().names, testing::UnorderedElementsAre("name")); + + auto nested_field = const_mapping.Find("data.value"); + ASSERT_TRUE(nested_field.has_value()); + EXPECT_EQ(nested_field->get().field_id, 4); + EXPECT_THAT(nested_field->get().names, testing::UnorderedElementsAre("value")); +} + +TEST_F(AvroReaderTest, NameMappingWithNestedFields) { + auto name_mapping = CreateTestNameMapping(); + const NameMapping& const_mapping = *name_mapping; + + // Test nested field access + auto data_field = const_mapping.Find("data"); + ASSERT_TRUE(data_field.has_value()); + EXPECT_EQ(data_field->get().field_id, 5); + EXPECT_THAT(data_field->get().names, testing::UnorderedElementsAre("data")); + + // Test that nested mapping exists + ASSERT_TRUE(data_field->get().nested_mapping != nullptr); + EXPECT_EQ(data_field->get().nested_mapping->Size(), 2); + + // Test nested field access through the nested mapping + auto value_field = data_field->get().nested_mapping->Field(3); + ASSERT_TRUE(value_field.has_value()); + EXPECT_EQ(value_field->get().field_id, 3); + EXPECT_THAT(value_field->get().names, testing::UnorderedElementsAre("value")); + + auto description_field = data_field->get().nested_mapping->Field(4); + ASSERT_TRUE(description_field.has_value()); + EXPECT_EQ(description_field->get().field_id, 4); + EXPECT_THAT(description_field->get().names, + testing::UnorderedElementsAre("description")); +} + +} // namespace iceberg::avro diff --git a/test/name_mapping_test.cc b/test/name_mapping_test.cc index 53c9fe00..70f71560 100644 --- a/test/name_mapping_test.cc +++ b/test/name_mapping_test.cc @@ -137,6 +137,58 @@ TEST_F(NameMappingTest, FindByNameParts) { } } +TEST_F(NameMappingTest, FindMethodsOnConstObject) { + auto mapping = MakeNameMapping(); + const NameMapping& const_mapping = *mapping; + + // Test Find by ID on const object + auto field_by_id = const_mapping.Find(1); + ASSERT_TRUE(field_by_id.has_value()); + EXPECT_EQ(field_by_id->get().field_id, 1); + EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("foo", "bar")); + + // Test Find by name on const object + auto field_by_name = const_mapping.Find("baz"); + ASSERT_TRUE(field_by_name.has_value()); + EXPECT_EQ(field_by_name->get().field_id, 2); + EXPECT_THAT(field_by_name->get().names, testing::UnorderedElementsAre("baz")); + + // Test Find by name parts on const object + auto field_by_parts = const_mapping.Find(std::vector{"qux", "hello"}); + ASSERT_TRUE(field_by_parts.has_value()); + EXPECT_EQ(field_by_parts->get().field_id, 4); + EXPECT_THAT(field_by_parts->get().names, testing::UnorderedElementsAre("hello")); + + // Test Find non-existent field on const object + auto non_existent = const_mapping.Find(999); + EXPECT_FALSE(non_existent.has_value()); + + auto non_existent_name = const_mapping.Find("non_existent"); + EXPECT_FALSE(non_existent_name.has_value()); + + auto non_existent_parts = + const_mapping.Find(std::vector{"non", "existent"}); + EXPECT_FALSE(non_existent_parts.has_value()); +} + +TEST_F(NameMappingTest, FindMethodsOnConstEmptyMapping) { + auto empty_mapping = NameMapping::MakeEmpty(); + const NameMapping& const_empty_mapping = *empty_mapping; + + // Test Find by ID on const empty mapping + auto field_by_id = const_empty_mapping.Find(1); + EXPECT_FALSE(field_by_id.has_value()); + + // Test Find by name on const empty mapping + auto field_by_name = const_empty_mapping.Find("test"); + EXPECT_FALSE(field_by_name.has_value()); + + // Test Find by name parts on const empty mapping + auto field_by_parts = + const_empty_mapping.Find(std::vector{"test", "field"}); + EXPECT_FALSE(field_by_parts.has_value()); +} + TEST_F(NameMappingTest, Equality) { auto mapping1 = MakeNameMapping(); auto mapping2 = MakeNameMapping(); From 79bb90e143b696a982699885db0830ad9a640661 Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Sun, 27 Jul 2025 09:05:38 +0800 Subject: [PATCH 02/15] fix build error and set name_mapping to ReaderOptions --- src/iceberg/avro/avro_constants.h | 40 ++++++++++++++ src/iceberg/avro/avro_reader.cc | 91 ++++++++++++++++--------------- src/iceberg/file_reader.h | 4 ++ test/avro_reader_test.cc | 21 +++++++ 4 files changed, 112 insertions(+), 44 deletions(-) create mode 100644 src/iceberg/avro/avro_constants.h diff --git a/src/iceberg/avro/avro_constants.h b/src/iceberg/avro/avro_constants.h new file mode 100644 index 00000000..7ad44f83 --- /dev/null +++ b/src/iceberg/avro/avro_constants.h @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +namespace iceberg::avro { + +// Avro field ID attribute constants +constexpr std::string_view kFieldId = "field-id"; +constexpr std::string_view kElementId = "element-id"; +constexpr std::string_view kKeyId = "key-id"; +constexpr std::string_view kValueId = "value-id"; + +// Avro logical type constants +constexpr std::string_view kMapLogicalType = "map"; + +// Name mapping field constants +constexpr std::string_view kElement = "element"; +constexpr std::string_view kKey = "key"; +constexpr std::string_view kValue = "value"; + +} // namespace iceberg::avro diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index a4b3e2c9..6a2b310a 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -36,7 +36,6 @@ #include "iceberg/avro/avro_data_util_internal.h" #include "iceberg/avro/avro_schema_util_internal.h" #include "iceberg/avro/avro_stream_internal.h" -#include "iceberg/json_internal.h" #include "iceberg/name_mapping.h" #include "iceberg/schema_internal.h" #include "iceberg/util/checked_cast.h" @@ -107,14 +106,9 @@ class AvroReader::Impl { if (has_id_visitor.HasNoIds()) { // Apply field IDs based on name mapping if available - auto name_mapping_iter = options.properties.find("name_mapping"); - if (name_mapping_iter != options.properties.end()) { - // Parse name mapping from JSON string - ICEBERG_ASSIGN_OR_RAISE(auto name_mapping, - iceberg::NameMappingFromJson( - nlohmann::json::parse(name_mapping_iter->second))); - ICEBERG_RETURN_UNEXPECTED( - ApplyFieldIdsFromNameMapping(file_schema.root(), *name_mapping)); + if (options.name_mapping) { + ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(*options.name_mapping, + file_schema.root().get())); } else { return NotImplemented( "Avro file schema has no field IDs and no name mapping provided"); @@ -227,8 +221,8 @@ class AvroReader::Impl { } // Apply field IDs to Avro schema nodes based on name mapping - Status ApplyFieldIdsFromNameMapping(const ::avro::NodePtr& node, - const NameMapping& name_mapping) { + Status ApplyFieldIdsFromNameMapping(const NameMapping& name_mapping, + ::avro::Node* node) { switch (node->type()) { case ::avro::AVRO_RECORD: return ApplyFieldIdsToRecord(node, name_mapping); @@ -255,19 +249,19 @@ class AvroReader::Impl { } } - Status ApplyFieldIdsToRecord(const ::avro::NodePtr& node, - const NameMapping& name_mapping) { + Status ApplyFieldIdsToRecord(::avro::Node* node, const NameMapping& name_mapping) { for (size_t i = 0; i < node->leaves(); ++i) { const std::string& field_name = node->nameAt(i); - ::avro::NodePtr field_node = node->leafAt(i); + ::avro::Node* field_node = node->leafAt(i).get(); // Try to find field ID by name in the name mapping if (auto field_ref = name_mapping.Find(field_name)) { if (field_ref->get().field_id.has_value()) { // Add field ID attribute to the node ::avro::CustomAttributes attributes; - attributes.addAttribute( - "field-id", std::to_string(field_ref->get().field_id.value()), false); + attributes.addAttribute(std::string(kFieldId), + std::to_string(field_ref->get().field_id.value()), + false); node->addCustomAttributesForField(attributes); } @@ -279,23 +273,24 @@ class AvroReader::Impl { std::vector fields_vector(fields_span.begin(), fields_span.end()); auto nested_name_mapping = NameMapping::Make(std::move(fields_vector)); ICEBERG_RETURN_UNEXPECTED( - ApplyFieldIdsFromNameMapping(field_node, *nested_name_mapping)); + ApplyFieldIdsFromNameMapping(*nested_name_mapping, field_node)); } else { // Recursively apply field IDs to child nodes (only if not already handled by // nested mapping) ICEBERG_RETURN_UNEXPECTED( - ApplyFieldIdsFromNameMapping(field_node, name_mapping)); + ApplyFieldIdsFromNameMapping(name_mapping, field_node)); } } else { // Recursively apply field IDs to child nodes even if no mapping found - ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(field_node, name_mapping)); + ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(name_mapping, field_node)); } } return {}; } - Status ApplyFieldIdsToArray(const ::avro::NodePtr& node, - const NameMapping& name_mapping) { + Status ApplyFieldIdsToArray(::avro::Node* node, const NameMapping& name_mapping) { + // TODO(liuxiaoyu): Add debug logging to print node information for troubleshooting + // when array type validation fails if (node->leaves() != 1) { return InvalidSchema("Array type must have exactly one leaf"); } @@ -303,53 +298,53 @@ class AvroReader::Impl { // Check if this is a map represented as array if (node->logicalType().type() == ::avro::LogicalType::CUSTOM && node->logicalType().customLogicalType() != nullptr && - node->logicalType().customLogicalType()->name() == "map") { - return ApplyFieldIdsFromNameMapping(node->leafAt(0), name_mapping); + node->logicalType().customLogicalType()->name() == kMapLogicalType) { + return ApplyFieldIdsFromNameMapping(name_mapping, node->leafAt(0).get()); } // For regular arrays, try to find element field ID - if (auto element_field = name_mapping.Find("element")) { + if (auto element_field = name_mapping.Find(std::string(kElement))) { if (element_field->get().field_id.has_value()) { ::avro::CustomAttributes attributes; - attributes.addAttribute( - "element-id", std::to_string(element_field->get().field_id.value()), false); + attributes.addAttribute(std::string(kElementId), + std::to_string(element_field->get().field_id.value()), + false); node->addCustomAttributesForField(attributes); } } - return ApplyFieldIdsFromNameMapping(node->leafAt(0), name_mapping); + return ApplyFieldIdsFromNameMapping(name_mapping, node->leafAt(0).get()); } - Status ApplyFieldIdsToMap(const ::avro::NodePtr& node, - const NameMapping& name_mapping) { + Status ApplyFieldIdsToMap(::avro::Node* node, const NameMapping& name_mapping) { if (node->leaves() != 2) { return InvalidSchema("Map type must have exactly two leaves"); } // Try to find key and value field IDs - if (auto key_field = name_mapping.Find("key")) { + if (auto key_field = name_mapping.Find(std::string(kKey))) { if (key_field->get().field_id.has_value()) { ::avro::CustomAttributes attributes; - attributes.addAttribute("key-id", + attributes.addAttribute(std::string(kKeyId), std::to_string(key_field->get().field_id.value()), false); node->addCustomAttributesForField(attributes); } } - if (auto value_field = name_mapping.Find("value")) { + if (auto value_field = name_mapping.Find(std::string(kValue))) { if (value_field->get().field_id.has_value()) { ::avro::CustomAttributes attributes; - attributes.addAttribute( - "value-id", std::to_string(value_field->get().field_id.value()), false); + attributes.addAttribute(std::string(kValueId), + std::to_string(value_field->get().field_id.value()), + false); node->addCustomAttributesForField(attributes); } } - return ApplyFieldIdsFromNameMapping(node->leafAt(1), name_mapping); + return ApplyFieldIdsFromNameMapping(name_mapping, node->leafAt(1).get()); } - Status ApplyFieldIdsToUnion(const ::avro::NodePtr& node, - const NameMapping& name_mapping) { + Status ApplyFieldIdsToUnion(::avro::Node* node, const NameMapping& name_mapping) { if (node->leaves() != 2) { return InvalidSchema("Union type must have exactly two branches"); } @@ -357,14 +352,22 @@ class AvroReader::Impl { const auto& branch_0 = node->leafAt(0); const auto& branch_1 = node->leafAt(1); - if (branch_0->type() == ::avro::AVRO_NULL) { - return ApplyFieldIdsFromNameMapping(branch_1, name_mapping); + bool branch_0_is_null = (branch_0->type() == ::avro::AVRO_NULL); + bool branch_1_is_null = (branch_1->type() == ::avro::AVRO_NULL); + + if (branch_0_is_null && !branch_1_is_null) { + // branch_0 is null, branch_1 is not null + return ApplyFieldIdsFromNameMapping(name_mapping, branch_1.get()); + } else if (!branch_0_is_null && branch_1_is_null) { + // branch_0 is not null, branch_1 is null + return ApplyFieldIdsFromNameMapping(name_mapping, branch_0.get()); + } else if (branch_0_is_null && branch_1_is_null) { + // Both branches are null - this is invalid + return InvalidSchema("Union type cannot have two null branches"); + } else { + // Neither branch is null - this is invalid + return InvalidSchema("Union type must have exactly one null branch"); } - if (branch_1->type() == ::avro::AVRO_NULL) { - return ApplyFieldIdsFromNameMapping(branch_0, name_mapping); - } - - return InvalidSchema("Union type must have exactly one null branch"); } private: diff --git a/src/iceberg/file_reader.h b/src/iceberg/file_reader.h index 6fc75225..faf6e28f 100644 --- a/src/iceberg/file_reader.h +++ b/src/iceberg/file_reader.h @@ -28,6 +28,7 @@ #include "iceberg/arrow_c_data.h" #include "iceberg/file_format.h" +#include "iceberg/name_mapping.h" #include "iceberg/result.h" #include "iceberg/type_fwd.h" @@ -86,6 +87,9 @@ struct ICEBERG_EXPORT ReaderOptions { /// \brief The filter to apply to the data. Reader implementations may ignore this if /// the file format does not support filtering. std::shared_ptr filter; + /// \brief Name mapping for schema evolution compatibility. Used when reading files + /// that may have different field names than the current schema. + std::shared_ptr name_mapping; /// \brief Format-specific or implementation-specific properties. std::unordered_map properties; }; diff --git a/test/avro_reader_test.cc b/test/avro_reader_test.cc index 4396e58d..0586bfcf 100644 --- a/test/avro_reader_test.cc +++ b/test/avro_reader_test.cc @@ -173,4 +173,25 @@ TEST_F(AvroReaderTest, NameMappingWithNestedFields) { testing::UnorderedElementsAre("description")); } +TEST_F(AvroReaderTest, NameMappingFromReaderOptionsWorks) { + // Create a name mapping + auto name_mapping = CreateTestNameMapping(); + ASSERT_TRUE(name_mapping != nullptr); + EXPECT_EQ(name_mapping->AsMappedFields().Size(), 3); + + // Create reader options with name mapping + ReaderOptions options; + options.name_mapping = std::move(name_mapping); + + // Verify that the name mapping is accessible + ASSERT_TRUE(options.name_mapping != nullptr); + EXPECT_EQ(options.name_mapping->AsMappedFields().Size(), 3); + + // Test that the name mapping works correctly + auto field_by_id = options.name_mapping->Find(1); + ASSERT_TRUE(field_by_id.has_value()); + EXPECT_EQ(field_by_id->get().field_id, 1); + EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); +} + } // namespace iceberg::avro From 0bc3f5a69f7159c0d72ae14a09c9db910292d837 Mon Sep 17 00:00:00 2001 From: liuxiaoyu <45345701+MisterRaindrop@users.noreply.github.com> Date: Mon, 30 Jun 2025 10:20:35 +0800 Subject: [PATCH 03/15] Update src/iceberg/file_reader.h Co-authored-by: Gang Wu --- src/iceberg/file_reader.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/iceberg/file_reader.h b/src/iceberg/file_reader.h index faf6e28f..8a59e33f 100644 --- a/src/iceberg/file_reader.h +++ b/src/iceberg/file_reader.h @@ -28,7 +28,6 @@ #include "iceberg/arrow_c_data.h" #include "iceberg/file_format.h" -#include "iceberg/name_mapping.h" #include "iceberg/result.h" #include "iceberg/type_fwd.h" From 31cc9973f5a79544f49be7d738c8bcfdf268531d Mon Sep 17 00:00:00 2001 From: liuxiaoyu <45345701+MisterRaindrop@users.noreply.github.com> Date: Mon, 30 Jun 2025 10:20:42 +0800 Subject: [PATCH 04/15] Update src/iceberg/avro/avro_reader.cc Co-authored-by: Gang Wu --- src/iceberg/avro/avro_reader.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index 6a2b310a..b42b13e5 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -30,7 +30,6 @@ #include #include #include -#include #include "iceberg/arrow/arrow_fs_file_io_internal.h" #include "iceberg/avro/avro_data_util_internal.h" From 2f2e4fdac075ec9f4e1acf3c05171e66b02684f0 Mon Sep 17 00:00:00 2001 From: liuxiaoyu <45345701+MisterRaindrop@users.noreply.github.com> Date: Mon, 30 Jun 2025 10:20:52 +0800 Subject: [PATCH 05/15] Update src/iceberg/avro/avro_reader.cc Co-authored-by: Gang Wu --- src/iceberg/avro/avro_reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index b42b13e5..2da700d7 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -109,7 +109,7 @@ class AvroReader::Impl { ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(*options.name_mapping, file_schema.root().get())); } else { - return NotImplemented( + return InvalidSchema( "Avro file schema has no field IDs and no name mapping provided"); } } else if (!has_id_visitor.AllHaveIds()) { From bb91f9d16458370a97bbcd059ab5b3b46ae78137 Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Sun, 27 Jul 2025 09:06:14 +0800 Subject: [PATCH 06/15] Refactor the applying field-ids on name mapping code. --- src/iceberg/avro/avro_reader.cc | 173 ++------------- src/iceberg/avro/avro_schema_util.cc | 199 ++++++++++++++++++ src/iceberg/avro/avro_schema_util_internal.h | 36 ++++ .../avro/{avro_constants.h => constants.h} | 6 - test/avro_reader_test.cc | 197 ----------------- test/avro_schema_test.cc | 137 ++++++++++++ 6 files changed, 392 insertions(+), 356 deletions(-) rename src/iceberg/avro/{avro_constants.h => constants.h} (82%) delete mode 100644 test/avro_reader_test.cc diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index 2da700d7..3a823e85 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -97,7 +97,7 @@ class AvroReader::Impl { // Create a base reader without setting reader schema to enable projection. auto base_reader = std::make_unique<::avro::DataFileReaderBase>(std::move(input_stream)); - const ::avro::ValidSchema& file_schema = base_reader->dataSchema(); + ::avro::ValidSchema file_schema = base_reader->dataSchema(); // Validate field ids in the file schema. HasIdVisitor has_id_visitor; @@ -106,8 +106,25 @@ class AvroReader::Impl { if (has_id_visitor.HasNoIds()) { // Apply field IDs based on name mapping if available if (options.name_mapping) { - ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(*options.name_mapping, - file_schema.root().get())); + ICEBERG_ASSIGN_OR_RAISE( + auto new_root_node, + CreateAvroNodeWithFieldIds(file_schema.root(), *options.name_mapping)); + + // Create a new schema with the updated root node + auto new_schema = ::avro::ValidSchema(new_root_node); + + // Verify that all fields now have IDs after applying the name mapping + HasIdVisitor verify_visitor; + ICEBERG_RETURN_UNEXPECTED(verify_visitor.Visit(new_schema)); + if (!verify_visitor.AllHaveIds()) { + // TODO(liuxiaoyu): Print detailed error message with missing field IDs + // information in future + return InvalidSchema( + "Not all fields have field IDs after applying name mapping."); + } + + // Update the file schema to use the new schema with field IDs + file_schema = new_schema; } else { return InvalidSchema( "Avro file schema has no field IDs and no name mapping provided"); @@ -219,156 +236,6 @@ class AvroReader::Impl { return arrow_array; } - // Apply field IDs to Avro schema nodes based on name mapping - Status ApplyFieldIdsFromNameMapping(const NameMapping& name_mapping, - ::avro::Node* node) { - switch (node->type()) { - case ::avro::AVRO_RECORD: - return ApplyFieldIdsToRecord(node, name_mapping); - case ::avro::AVRO_ARRAY: - return ApplyFieldIdsToArray(node, name_mapping); - case ::avro::AVRO_MAP: - return ApplyFieldIdsToMap(node, name_mapping); - case ::avro::AVRO_UNION: - return ApplyFieldIdsToUnion(node, name_mapping); - case ::avro::AVRO_BOOL: - case ::avro::AVRO_INT: - case ::avro::AVRO_LONG: - case ::avro::AVRO_FLOAT: - case ::avro::AVRO_DOUBLE: - case ::avro::AVRO_STRING: - case ::avro::AVRO_BYTES: - case ::avro::AVRO_FIXED: - return {}; - case ::avro::AVRO_NULL: - case ::avro::AVRO_ENUM: - default: - return InvalidSchema("Unsupported Avro type for field ID application: {}", - static_cast(node->type())); - } - } - - Status ApplyFieldIdsToRecord(::avro::Node* node, const NameMapping& name_mapping) { - for (size_t i = 0; i < node->leaves(); ++i) { - const std::string& field_name = node->nameAt(i); - ::avro::Node* field_node = node->leafAt(i).get(); - - // Try to find field ID by name in the name mapping - if (auto field_ref = name_mapping.Find(field_name)) { - if (field_ref->get().field_id.has_value()) { - // Add field ID attribute to the node - ::avro::CustomAttributes attributes; - attributes.addAttribute(std::string(kFieldId), - std::to_string(field_ref->get().field_id.value()), - false); - node->addCustomAttributesForField(attributes); - } - - // Recursively apply field IDs to nested fields if they exist - if (field_ref->get().nested_mapping && - field_node->type() == ::avro::AVRO_RECORD) { - const auto& nested_mapping = field_ref->get().nested_mapping; - auto fields_span = nested_mapping->fields(); - std::vector fields_vector(fields_span.begin(), fields_span.end()); - auto nested_name_mapping = NameMapping::Make(std::move(fields_vector)); - ICEBERG_RETURN_UNEXPECTED( - ApplyFieldIdsFromNameMapping(*nested_name_mapping, field_node)); - } else { - // Recursively apply field IDs to child nodes (only if not already handled by - // nested mapping) - ICEBERG_RETURN_UNEXPECTED( - ApplyFieldIdsFromNameMapping(name_mapping, field_node)); - } - } else { - // Recursively apply field IDs to child nodes even if no mapping found - ICEBERG_RETURN_UNEXPECTED(ApplyFieldIdsFromNameMapping(name_mapping, field_node)); - } - } - return {}; - } - - Status ApplyFieldIdsToArray(::avro::Node* node, const NameMapping& name_mapping) { - // TODO(liuxiaoyu): Add debug logging to print node information for troubleshooting - // when array type validation fails - if (node->leaves() != 1) { - return InvalidSchema("Array type must have exactly one leaf"); - } - - // Check if this is a map represented as array - if (node->logicalType().type() == ::avro::LogicalType::CUSTOM && - node->logicalType().customLogicalType() != nullptr && - node->logicalType().customLogicalType()->name() == kMapLogicalType) { - return ApplyFieldIdsFromNameMapping(name_mapping, node->leafAt(0).get()); - } - - // For regular arrays, try to find element field ID - if (auto element_field = name_mapping.Find(std::string(kElement))) { - if (element_field->get().field_id.has_value()) { - ::avro::CustomAttributes attributes; - attributes.addAttribute(std::string(kElementId), - std::to_string(element_field->get().field_id.value()), - false); - node->addCustomAttributesForField(attributes); - } - } - - return ApplyFieldIdsFromNameMapping(name_mapping, node->leafAt(0).get()); - } - - Status ApplyFieldIdsToMap(::avro::Node* node, const NameMapping& name_mapping) { - if (node->leaves() != 2) { - return InvalidSchema("Map type must have exactly two leaves"); - } - - // Try to find key and value field IDs - if (auto key_field = name_mapping.Find(std::string(kKey))) { - if (key_field->get().field_id.has_value()) { - ::avro::CustomAttributes attributes; - attributes.addAttribute(std::string(kKeyId), - std::to_string(key_field->get().field_id.value()), false); - node->addCustomAttributesForField(attributes); - } - } - - if (auto value_field = name_mapping.Find(std::string(kValue))) { - if (value_field->get().field_id.has_value()) { - ::avro::CustomAttributes attributes; - attributes.addAttribute(std::string(kValueId), - std::to_string(value_field->get().field_id.value()), - false); - node->addCustomAttributesForField(attributes); - } - } - - return ApplyFieldIdsFromNameMapping(name_mapping, node->leafAt(1).get()); - } - - Status ApplyFieldIdsToUnion(::avro::Node* node, const NameMapping& name_mapping) { - if (node->leaves() != 2) { - return InvalidSchema("Union type must have exactly two branches"); - } - - const auto& branch_0 = node->leafAt(0); - const auto& branch_1 = node->leafAt(1); - - bool branch_0_is_null = (branch_0->type() == ::avro::AVRO_NULL); - bool branch_1_is_null = (branch_1->type() == ::avro::AVRO_NULL); - - if (branch_0_is_null && !branch_1_is_null) { - // branch_0 is null, branch_1 is not null - return ApplyFieldIdsFromNameMapping(name_mapping, branch_1.get()); - } else if (!branch_0_is_null && branch_1_is_null) { - // branch_0 is not null, branch_1 is null - return ApplyFieldIdsFromNameMapping(name_mapping, branch_0.get()); - } else if (branch_0_is_null && branch_1_is_null) { - // Both branches are null - this is invalid - return InvalidSchema("Union type cannot have two null branches"); - } else { - // Neither branch is null - this is invalid - return InvalidSchema("Union type must have exactly one null branch"); - } - } - private: // Max number of rows in the record batch to read. int64_t batch_size_{}; diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 28e9e7bf..73a2cc38 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -31,7 +31,9 @@ #include "iceberg/avro/avro_register.h" #include "iceberg/avro/avro_schema_util_internal.h" +#include "iceberg/avro/constants.h" #include "iceberg/metadata_columns.h" +#include "iceberg/name_mapping.h" #include "iceberg/schema.h" #include "iceberg/schema_util_internal.h" #include "iceberg/util/formatter.h" @@ -773,4 +775,201 @@ Result Project(const Schema& expected_schema, return SchemaProjection{std::move(field_projection.children)}; } +// Helper function to create a new Avro node with field IDs from name mapping +Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& name_mapping) { + switch (original_node->type()) { + case ::avro::AVRO_RECORD: + return CreateRecordNodeWithFieldIds(original_node, name_mapping); + case ::avro::AVRO_ARRAY: + return CreateArrayNodeWithFieldIds(original_node, name_mapping); + case ::avro::AVRO_MAP: + return CreateMapNodeWithFieldIds(original_node, name_mapping); + case ::avro::AVRO_UNION: + return CreateUnionNodeWithFieldIds(original_node, name_mapping); + case ::avro::AVRO_BOOL: + case ::avro::AVRO_INT: + case ::avro::AVRO_LONG: + case ::avro::AVRO_FLOAT: + case ::avro::AVRO_DOUBLE: + case ::avro::AVRO_STRING: + case ::avro::AVRO_BYTES: + case ::avro::AVRO_FIXED: + // For primitive types, just return a copy + return original_node; + case ::avro::AVRO_NULL: + case ::avro::AVRO_ENUM: + default: + return InvalidSchema("Unsupported Avro type for field ID application: {}", + ToString(original_node)); + } +} + +Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& name_mapping) { + auto new_record_node = std::make_shared<::avro::NodeRecord>(); + new_record_node->setName(original_node->name()); + + for (size_t i = 0; i < original_node->leaves(); ++i) { + const std::string& field_name = original_node->nameAt(i); + ::avro::NodePtr field_node = original_node->leafAt(i); + + // Try to find field ID by name in the name mapping + if (auto field_ref = name_mapping.Find(field_name)) { + if (field_ref->get().field_id.has_value()) { + // Add field ID attribute to the new node + ::avro::CustomAttributes attributes; + attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(field_ref->get().field_id.value()), false); + new_record_node->addCustomAttributesForField(attributes); + } + + // Recursively apply field IDs to nested fields if they exist + if (field_ref->get().nested_mapping && field_node->type() == ::avro::AVRO_RECORD) { + const auto& nested_mapping = field_ref->get().nested_mapping; + auto fields_span = nested_mapping->fields(); + std::vector fields_vector(fields_span.begin(), fields_span.end()); + auto nested_name_mapping = NameMapping::Make(std::move(fields_vector)); + + ICEBERG_ASSIGN_OR_RAISE( + auto new_nested_node, + CreateAvroNodeWithFieldIds(field_node, *nested_name_mapping)); + new_record_node->addName(field_name); + new_record_node->addLeaf(new_nested_node); + } else { + // Recursively apply field IDs to child nodes + ICEBERG_ASSIGN_OR_RAISE(auto new_field_node, + CreateAvroNodeWithFieldIds(field_node, name_mapping)); + new_record_node->addName(field_name); + new_record_node->addLeaf(new_field_node); + } + } else { + // Recursively apply field IDs to child nodes even if no mapping found + ICEBERG_ASSIGN_OR_RAISE(auto new_field_node, + CreateAvroNodeWithFieldIds(field_node, name_mapping)); + new_record_node->addName(field_name); + new_record_node->addLeaf(new_field_node); + } + } + + return new_record_node; +} + +Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& name_mapping) { + if (original_node->leaves() != 1) { + return InvalidSchema("Array type must have exactly one leaf"); + } + + auto new_array_node = std::make_shared<::avro::NodeArray>(); + new_array_node->setName(original_node->name()); + new_array_node->setLogicalType(original_node->logicalType()); + + // Check if this is a map represented as array + if (original_node->logicalType().type() == ::avro::LogicalType::CUSTOM && + original_node->logicalType().customLogicalType() != nullptr && + original_node->logicalType().customLogicalType()->name() == kMapLogicalType) { + ICEBERG_ASSIGN_OR_RAISE( + auto new_element_node, + CreateAvroNodeWithFieldIds(original_node->leafAt(0), name_mapping)); + new_array_node->addLeaf(new_element_node); + return new_array_node; + } + + // For regular arrays, try to find element field ID + if (auto element_field = name_mapping.Find(std::string(kElement))) { + if (element_field->get().field_id.has_value()) { + ::avro::CustomAttributes attributes; + attributes.addAttribute(std::string(kElementIdProp), + std::to_string(element_field->get().field_id.value()), + false); + new_array_node->addCustomAttributesForField(attributes); + } + } + + ICEBERG_ASSIGN_OR_RAISE( + auto new_element_node, + CreateAvroNodeWithFieldIds(original_node->leafAt(0), name_mapping)); + new_array_node->addLeaf(new_element_node); + return new_array_node; +} + +Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& name_mapping) { + if (original_node->leaves() != 2) { + return InvalidSchema("Map type must have exactly two leaves"); + } + + auto new_map_node = std::make_shared<::avro::NodeMap>(); + new_map_node->setName(original_node->name()); + new_map_node->setLogicalType(original_node->logicalType()); + + // Try to find key and value field IDs + if (auto key_field = name_mapping.Find(std::string(kKey))) { + if (key_field->get().field_id.has_value()) { + ::avro::CustomAttributes attributes; + attributes.addAttribute(std::string(kKeyIdProp), + std::to_string(key_field->get().field_id.value()), false); + new_map_node->addCustomAttributesForField(attributes); + } + } + + if (auto value_field = name_mapping.Find(std::string(kValue))) { + if (value_field->get().field_id.has_value()) { + ::avro::CustomAttributes attributes; + attributes.addAttribute(std::string(kValueIdProp), + std::to_string(value_field->get().field_id.value()), false); + new_map_node->addCustomAttributesForField(attributes); + } + } + + // Add key and value nodes + ICEBERG_ASSIGN_OR_RAISE(auto new_key_node, CreateAvroNodeWithFieldIds( + original_node->leafAt(0), name_mapping)); + ICEBERG_ASSIGN_OR_RAISE( + auto new_value_node, + CreateAvroNodeWithFieldIds(original_node->leafAt(1), name_mapping)); + new_map_node->addLeaf(new_key_node); + new_map_node->addLeaf(new_value_node); + + return new_map_node; +} + +Result<::avro::NodePtr> CreateUnionNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& name_mapping) { + if (original_node->leaves() != 2) { + return InvalidSchema("Union type must have exactly two branches"); + } + + const auto& branch_0 = original_node->leafAt(0); + const auto& branch_1 = original_node->leafAt(1); + + bool branch_0_is_null = (branch_0->type() == ::avro::AVRO_NULL); + bool branch_1_is_null = (branch_1->type() == ::avro::AVRO_NULL); + + if (branch_0_is_null && !branch_1_is_null) { + // branch_0 is null, branch_1 is not null + ICEBERG_ASSIGN_OR_RAISE(auto new_branch_1, + CreateAvroNodeWithFieldIds(branch_1, name_mapping)); + auto new_union_node = std::make_shared<::avro::NodeUnion>(); + new_union_node->addLeaf(branch_0); // null branch + new_union_node->addLeaf(new_branch_1); + return new_union_node; + } else if (!branch_0_is_null && branch_1_is_null) { + // branch_0 is not null, branch_1 is null + ICEBERG_ASSIGN_OR_RAISE(auto new_branch_0, + CreateAvroNodeWithFieldIds(branch_0, name_mapping)); + auto new_union_node = std::make_shared<::avro::NodeUnion>(); + new_union_node->addLeaf(new_branch_0); + new_union_node->addLeaf(branch_1); // null branch + return new_union_node; + } else if (branch_0_is_null && branch_1_is_null) { + // Both branches are null - this is invalid + return InvalidSchema("Union type cannot have two null branches"); + } else { + // Neither branch is null - this is invalid + return InvalidSchema("Union type must have exactly one null branch"); + } +} + } // namespace iceberg::avro diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index de3922a2..7e6644f4 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -23,6 +23,7 @@ #include +#include "iceberg/name_mapping.h" #include "iceberg/result.h" #include "iceberg/schema_util.h" #include "iceberg/type.h" @@ -148,4 +149,39 @@ std::string ToString(const ::avro::LogicalType::Type& logical_type); /// \return True if the node has a map logical type, false otherwise. bool HasMapLogicalType(const ::avro::NodePtr& node); +/// \brief Create a new Avro node with field IDs from name mapping. +/// \param original_node The original Avro node to copy. +/// \param name_mapping The name mapping to apply field IDs from. +/// \return A new Avro node with field IDs applied, or an error. +Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& name_mapping); + +/// \brief Create a new Avro record node with field IDs from name mapping. +/// \param original_node The original Avro record node to copy. +/// \param name_mapping The name mapping to apply field IDs from. +/// \return A new Avro record node with field IDs applied, or an error. +Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& name_mapping); + +/// \brief Create a new Avro array node with field IDs from name mapping. +/// \param original_node The original Avro array node to copy. +/// \param name_mapping The name mapping to apply field IDs from. +/// \return A new Avro array node with field IDs applied, or an error. +Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& name_mapping); + +/// \brief Create a new Avro map node with field IDs from name mapping. +/// \param original_node The original Avro map node to copy. +/// \param name_mapping The name mapping to apply field IDs from. +/// \return A new Avro map node with field IDs applied, or an error. +Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& name_mapping); + +/// \brief Create a new Avro union node with field IDs from name mapping. +/// \param original_node The original Avro union node to copy. +/// \param name_mapping The name mapping to apply field IDs from. +/// \return A new Avro union node with field IDs applied, or an error. +Result<::avro::NodePtr> CreateUnionNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& name_mapping); + } // namespace iceberg::avro diff --git a/src/iceberg/avro/avro_constants.h b/src/iceberg/avro/constants.h similarity index 82% rename from src/iceberg/avro/avro_constants.h rename to src/iceberg/avro/constants.h index 7ad44f83..b56cc666 100644 --- a/src/iceberg/avro/avro_constants.h +++ b/src/iceberg/avro/constants.h @@ -23,12 +23,6 @@ namespace iceberg::avro { -// Avro field ID attribute constants -constexpr std::string_view kFieldId = "field-id"; -constexpr std::string_view kElementId = "element-id"; -constexpr std::string_view kKeyId = "key-id"; -constexpr std::string_view kValueId = "value-id"; - // Avro logical type constants constexpr std::string_view kMapLogicalType = "map"; diff --git a/test/avro_reader_test.cc b/test/avro_reader_test.cc deleted file mode 100644 index 0586bfcf..00000000 --- a/test/avro_reader_test.cc +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "iceberg/avro/avro_reader.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "iceberg/json_internal.h" -#include "iceberg/name_mapping.h" -#include "iceberg/result.h" -#include "iceberg/schema.h" -#include "matchers.h" - -namespace iceberg::avro { - -class AvroReaderTest : public ::testing::Test { - protected: - void SetUp() override { - // Create a simple Avro schema for testing - std::string schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "long"}, - {"name": "name", "type": "string"}, - {"name": "data", "type": { - "type": "record", - "name": "nested_record", - "fields": [ - {"name": "value", "type": "int"}, - {"name": "description", "type": "string"} - ] - }} - ] - })"; - - avro_schema_ = ::avro::compileJsonSchemaFromString(schema_json); - } - - std::unique_ptr CreateTestNameMapping() { - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); - - // Create nested mapping for the data field - std::vector nested_fields; - nested_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 3}); - nested_fields.emplace_back(MappedField{.names = {"description"}, .field_id = 4}); - auto nested_mapping = MappedFields::Make(std::move(nested_fields)); - - fields.emplace_back(MappedField{ - .names = {"data"}, .field_id = 5, .nested_mapping = std::move(nested_mapping)}); - - return NameMapping::Make(std::move(fields)); - } - - ::avro::ValidSchema avro_schema_; -}; - -TEST_F(AvroReaderTest, ApplyFieldIdsFromNameMappingBasic) { - auto name_mapping = CreateTestNameMapping(); - - // Test that the schema can be processed without errors - // Note: This is a basic test to ensure the function can be called - // In a real implementation, we would need to mock the Avro Node interface - // or create actual Avro schema nodes to test the field ID application - - EXPECT_TRUE(name_mapping != nullptr); - EXPECT_EQ(name_mapping->AsMappedFields().Size(), 3); -} - -TEST_F(AvroReaderTest, NameMappingFindMethodsWorkWithConst) { - auto name_mapping = CreateTestNameMapping(); - const NameMapping& const_mapping = *name_mapping; - - // Test that Find methods work on const objects - auto field_by_id = const_mapping.Find(1); - ASSERT_TRUE(field_by_id.has_value()); - EXPECT_EQ(field_by_id->get().field_id, 1); - EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); - - auto field_by_name = const_mapping.Find("name"); - ASSERT_TRUE(field_by_name.has_value()); - EXPECT_EQ(field_by_name->get().field_id, 2); - EXPECT_THAT(field_by_name->get().names, testing::UnorderedElementsAre("name")); - - auto field_by_parts = const_mapping.Find(std::vector{"data", "value"}); - ASSERT_TRUE(field_by_parts.has_value()); - EXPECT_EQ(field_by_parts->get().field_id, 3); - EXPECT_THAT(field_by_parts->get().names, testing::UnorderedElementsAre("value")); -} - -TEST_F(AvroReaderTest, NameMappingFromJsonWorks) { - std::string json_str = R"([ - {"field-id": 1, "names": ["id"]}, - {"field-id": 2, "names": ["name"]}, - {"field-id": 3, "names": ["data"], "fields": [ - {"field-id": 4, "names": ["value"]}, - {"field-id": 5, "names": ["description"]} - ]} - ])"; - - auto result = iceberg::NameMappingFromJson(nlohmann::json::parse(json_str)); - ASSERT_TRUE(result.has_value()); - - auto mapping = std::move(result.value()); - const NameMapping& const_mapping = *mapping; - - // Test that the parsed mapping works correctly - auto field_by_id = const_mapping.Find(1); - ASSERT_TRUE(field_by_id.has_value()); - EXPECT_EQ(field_by_id->get().field_id, 1); - EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); - - auto field_by_name = const_mapping.Find("name"); - ASSERT_TRUE(field_by_name.has_value()); - EXPECT_EQ(field_by_name->get().field_id, 2); - EXPECT_THAT(field_by_name->get().names, testing::UnorderedElementsAre("name")); - - auto nested_field = const_mapping.Find("data.value"); - ASSERT_TRUE(nested_field.has_value()); - EXPECT_EQ(nested_field->get().field_id, 4); - EXPECT_THAT(nested_field->get().names, testing::UnorderedElementsAre("value")); -} - -TEST_F(AvroReaderTest, NameMappingWithNestedFields) { - auto name_mapping = CreateTestNameMapping(); - const NameMapping& const_mapping = *name_mapping; - - // Test nested field access - auto data_field = const_mapping.Find("data"); - ASSERT_TRUE(data_field.has_value()); - EXPECT_EQ(data_field->get().field_id, 5); - EXPECT_THAT(data_field->get().names, testing::UnorderedElementsAre("data")); - - // Test that nested mapping exists - ASSERT_TRUE(data_field->get().nested_mapping != nullptr); - EXPECT_EQ(data_field->get().nested_mapping->Size(), 2); - - // Test nested field access through the nested mapping - auto value_field = data_field->get().nested_mapping->Field(3); - ASSERT_TRUE(value_field.has_value()); - EXPECT_EQ(value_field->get().field_id, 3); - EXPECT_THAT(value_field->get().names, testing::UnorderedElementsAre("value")); - - auto description_field = data_field->get().nested_mapping->Field(4); - ASSERT_TRUE(description_field.has_value()); - EXPECT_EQ(description_field->get().field_id, 4); - EXPECT_THAT(description_field->get().names, - testing::UnorderedElementsAre("description")); -} - -TEST_F(AvroReaderTest, NameMappingFromReaderOptionsWorks) { - // Create a name mapping - auto name_mapping = CreateTestNameMapping(); - ASSERT_TRUE(name_mapping != nullptr); - EXPECT_EQ(name_mapping->AsMappedFields().Size(), 3); - - // Create reader options with name mapping - ReaderOptions options; - options.name_mapping = std::move(name_mapping); - - // Verify that the name mapping is accessible - ASSERT_TRUE(options.name_mapping != nullptr); - EXPECT_EQ(options.name_mapping->AsMappedFields().Size(), 3); - - // Test that the name mapping works correctly - auto field_by_id = options.name_mapping->Find(1); - ASSERT_TRUE(field_by_id.has_value()); - EXPECT_EQ(field_by_id->get().field_id, 1); - EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); -} - -} // namespace iceberg::avro diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc index 875bfe95..cc24d724 100644 --- a/test/avro_schema_test.cc +++ b/test/avro_schema_test.cc @@ -22,10 +22,15 @@ #include #include #include +#include #include +#include +#include "iceberg/avro/avro_reader.h" #include "iceberg/avro/avro_schema_util_internal.h" +#include "iceberg/json_internal.h" #include "iceberg/metadata_columns.h" +#include "iceberg/name_mapping.h" #include "iceberg/schema.h" #include "matchers.h" @@ -1058,3 +1063,135 @@ TEST(AvroSchemaProjectionTest, ProjectDecimalIncompatible) { } } // namespace iceberg::avro + +// NameMapping tests for Avro schema context +namespace iceberg::avro { + +std::unique_ptr CreateTestNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); + + // Create nested mapping for the data field + std::vector nested_fields; + nested_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 3}); + nested_fields.emplace_back(MappedField{.names = {"description"}, .field_id = 4}); + auto nested_mapping = MappedFields::Make(std::move(nested_fields)); + + fields.emplace_back(MappedField{ + .names = {"data"}, .field_id = 5, .nested_mapping = std::move(nested_mapping)}); + + return NameMapping::Make(std::move(fields)); +} + +TEST(NameMappingBasicTest, Basic) { + auto name_mapping = CreateTestNameMapping(); + + // Test that the name mapping can be created and accessed + EXPECT_TRUE(name_mapping != nullptr); + EXPECT_EQ(name_mapping->AsMappedFields().Size(), 3); +} + +TEST(NameMappingFindMethodsTest, FindMethodsWorkWithConst) { + auto name_mapping = CreateTestNameMapping(); + const NameMapping& const_mapping = *name_mapping; + + // Test that Find methods work on const objects + auto field_by_id = const_mapping.Find(1); + ASSERT_TRUE(field_by_id.has_value()); + EXPECT_EQ(field_by_id->get().field_id, 1); + EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); + + auto field_by_name = const_mapping.Find("name"); + ASSERT_TRUE(field_by_name.has_value()); + EXPECT_EQ(field_by_name->get().field_id, 2); + EXPECT_THAT(field_by_name->get().names, testing::UnorderedElementsAre("name")); + + auto field_by_parts = const_mapping.Find(std::vector{"data", "value"}); + ASSERT_TRUE(field_by_parts.has_value()); + EXPECT_EQ(field_by_parts->get().field_id, 3); + EXPECT_THAT(field_by_parts->get().names, testing::UnorderedElementsAre("value")); +} + +TEST(NameMappingFromJsonTest, FromJsonWorks) { + std::string json_str = R"([ + {"field-id": 1, "names": ["id"]}, + {"field-id": 2, "names": ["name"]}, + {"field-id": 3, "names": ["data"], "fields": [ + {"field-id": 4, "names": ["value"]}, + {"field-id": 5, "names": ["description"]} + ]} + ])"; + + auto result = iceberg::NameMappingFromJson(nlohmann::json::parse(json_str)); + ASSERT_TRUE(result.has_value()); + + auto mapping = std::move(result.value()); + const NameMapping& const_mapping = *mapping; + + // Test that the parsed mapping works correctly + auto field_by_id = const_mapping.Find(1); + ASSERT_TRUE(field_by_id.has_value()); + EXPECT_EQ(field_by_id->get().field_id, 1); + EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); + + auto field_by_name = const_mapping.Find("name"); + ASSERT_TRUE(field_by_name.has_value()); + EXPECT_EQ(field_by_name->get().field_id, 2); + EXPECT_THAT(field_by_name->get().names, testing::UnorderedElementsAre("name")); + + auto nested_field = const_mapping.Find("data.value"); + ASSERT_TRUE(nested_field.has_value()); + EXPECT_EQ(nested_field->get().field_id, 4); + EXPECT_THAT(nested_field->get().names, testing::UnorderedElementsAre("value")); +} + +TEST(NameMappingNestedFieldsTest, WithNestedFields) { + auto name_mapping = CreateTestNameMapping(); + const NameMapping& const_mapping = *name_mapping; + + // Test nested field access + auto data_field = const_mapping.Find("data"); + ASSERT_TRUE(data_field.has_value()); + EXPECT_EQ(data_field->get().field_id, 5); + EXPECT_THAT(data_field->get().names, testing::UnorderedElementsAre("data")); + + // Test that nested mapping exists + ASSERT_TRUE(data_field->get().nested_mapping != nullptr); + EXPECT_EQ(data_field->get().nested_mapping->Size(), 2); + + // Test nested field access through the nested mapping + auto value_field = data_field->get().nested_mapping->Field(3); + ASSERT_TRUE(value_field.has_value()); + EXPECT_EQ(value_field->get().field_id, 3); + EXPECT_THAT(value_field->get().names, testing::UnorderedElementsAre("value")); + + auto description_field = data_field->get().nested_mapping->Field(4); + ASSERT_TRUE(description_field.has_value()); + EXPECT_EQ(description_field->get().field_id, 4); + EXPECT_THAT(description_field->get().names, + testing::UnorderedElementsAre("description")); +} + +TEST(NameMappingReaderOptionsTest, FromReaderOptionsWorks) { + // Create a name mapping + auto name_mapping = CreateTestNameMapping(); + ASSERT_TRUE(name_mapping != nullptr); + EXPECT_EQ(name_mapping->AsMappedFields().Size(), 3); + + // Create reader options with name mapping + ReaderOptions options; + options.name_mapping = std::move(name_mapping); + + // Verify that the name mapping is accessible + ASSERT_TRUE(options.name_mapping != nullptr); + EXPECT_EQ(options.name_mapping->AsMappedFields().Size(), 3); + + // Test that the name mapping works correctly + auto field_by_id = options.name_mapping->Find(1); + ASSERT_TRUE(field_by_id.has_value()); + EXPECT_EQ(field_by_id->get().field_id, 1); + EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); +} + +} // namespace iceberg::avro From b4d110fe4a2209a4f24cad43b8f3af92025e4140 Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Mon, 28 Jul 2025 21:10:16 +0800 Subject: [PATCH 07/15] fix applying field-ids on name mapping code --- src/iceberg/avro/avro_reader.cc | 6 +- src/iceberg/avro/avro_schema_util.cc | 242 +++++----- src/iceberg/avro/avro_schema_util_internal.h | 32 +- test/avro_schema_test.cc | 133 ------ test/name_mapping_test.cc | 443 +++++++++++++++++++ 5 files changed, 589 insertions(+), 267 deletions(-) diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index 3a823e85..d8df11d5 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -106,9 +106,13 @@ class AvroReader::Impl { if (has_id_visitor.HasNoIds()) { // Apply field IDs based on name mapping if available if (options.name_mapping) { + MappedField mapped_field; + // Convert NameMapping to MappedFields for nested mapping + mapped_field.nested_mapping = + std::make_shared(options.name_mapping->AsMappedFields()); ICEBERG_ASSIGN_OR_RAISE( auto new_root_node, - CreateAvroNodeWithFieldIds(file_schema.root(), *options.name_mapping)); + CreateAvroNodeWithFieldIds(file_schema.root(), mapped_field)); // Create a new schema with the updated root node auto new_schema = ::avro::ValidSchema(new_root_node); diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 73a2cc38..15632826 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -775,80 +775,69 @@ Result Project(const Schema& expected_schema, return SchemaProjection{std::move(field_projection.children)}; } -// Helper function to create a new Avro node with field IDs from name mapping -Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& name_mapping) { - switch (original_node->type()) { - case ::avro::AVRO_RECORD: - return CreateRecordNodeWithFieldIds(original_node, name_mapping); - case ::avro::AVRO_ARRAY: - return CreateArrayNodeWithFieldIds(original_node, name_mapping); - case ::avro::AVRO_MAP: - return CreateMapNodeWithFieldIds(original_node, name_mapping); - case ::avro::AVRO_UNION: - return CreateUnionNodeWithFieldIds(original_node, name_mapping); - case ::avro::AVRO_BOOL: - case ::avro::AVRO_INT: - case ::avro::AVRO_LONG: - case ::avro::AVRO_FLOAT: - case ::avro::AVRO_DOUBLE: - case ::avro::AVRO_STRING: - case ::avro::AVRO_BYTES: - case ::avro::AVRO_FIXED: - // For primitive types, just return a copy - return original_node; - case ::avro::AVRO_NULL: - case ::avro::AVRO_ENUM: - default: - return InvalidSchema("Unsupported Avro type for field ID application: {}", - ToString(original_node)); - } -} +namespace { Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& name_mapping) { + const MappedField& field) { auto new_record_node = std::make_shared<::avro::NodeRecord>(); new_record_node->setName(original_node->name()); + if (original_node->leaves() > original_node->names()) { + return InvalidSchema("Node has {} leaves but only {} names", original_node->leaves(), + original_node->names()); + } + for (size_t i = 0; i < original_node->leaves(); ++i) { const std::string& field_name = original_node->nameAt(i); ::avro::NodePtr field_node = original_node->leafAt(i); - // Try to find field ID by name in the name mapping - if (auto field_ref = name_mapping.Find(field_name)) { - if (field_ref->get().field_id.has_value()) { - // Add field ID attribute to the new node - ::avro::CustomAttributes attributes; - attributes.addAttribute(std::string(kFieldIdProp), - std::to_string(field_ref->get().field_id.value()), false); - new_record_node->addCustomAttributesForField(attributes); + // TODO(liuxiaoyu): Add support for case sensitivity in name matching. + // Try to find nested field by name + const MappedField* nested_field = nullptr; + if (field.nested_mapping) { + auto fields_span = field.nested_mapping->fields(); + for (const auto& f : fields_span) { + if (f.names.find(field_name) != f.names.end()) { + nested_field = &f; + break; + } } + } - // Recursively apply field IDs to nested fields if they exist - if (field_ref->get().nested_mapping && field_node->type() == ::avro::AVRO_RECORD) { - const auto& nested_mapping = field_ref->get().nested_mapping; - auto fields_span = nested_mapping->fields(); - std::vector fields_vector(fields_span.begin(), fields_span.end()); - auto nested_name_mapping = NameMapping::Make(std::move(fields_vector)); + if (nested_field) { + // Check if field_id is present + if (!nested_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for field '{}' in nested mapping", + field_name); + } - ICEBERG_ASSIGN_OR_RAISE( - auto new_nested_node, - CreateAvroNodeWithFieldIds(field_node, *nested_name_mapping)); - new_record_node->addName(field_name); - new_record_node->addLeaf(new_nested_node); - } else { - // Recursively apply field IDs to child nodes - ICEBERG_ASSIGN_OR_RAISE(auto new_field_node, - CreateAvroNodeWithFieldIds(field_node, name_mapping)); - new_record_node->addName(field_name); - new_record_node->addLeaf(new_field_node); + // Preserve existing custom attributes for this field + ::avro::CustomAttributes attributes; + if (i < original_node->customAttributes()) { + // Copy all existing attributes from the original node + const auto& original_attrs = original_node->customAttributesAt(i); + const auto& existing_attrs = original_attrs.attributes(); + for (const auto& attr_pair : existing_attrs) { + attributes.addAttribute(attr_pair.first, attr_pair.second, false); + } } - } else { - // Recursively apply field IDs to child nodes even if no mapping found - ICEBERG_ASSIGN_OR_RAISE(auto new_field_node, - CreateAvroNodeWithFieldIds(field_node, name_mapping)); + + // Add field ID attribute to the new node (preserving existing attributes) + attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(nested_field->field_id.value()), false); + + if (!attributes.attributes().empty()) { + new_record_node->addCustomAttributesForField(attributes); + } + + // Recursively apply field IDs to nested fields + ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node, + CreateAvroNodeWithFieldIds(field_node, *nested_field)); new_record_node->addName(field_name); - new_record_node->addLeaf(new_field_node); + new_record_node->addLeaf(new_nested_node); + } else { + // If no nested field found, this is an error + return InvalidSchema("Field '{}' not found in nested mapping", field_name); } } @@ -856,79 +845,95 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig } Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& name_mapping) { + const MappedField& field) { if (original_node->leaves() != 1) { return InvalidSchema("Array type must have exactly one leaf"); } auto new_array_node = std::make_shared<::avro::NodeArray>(); - new_array_node->setName(original_node->name()); - new_array_node->setLogicalType(original_node->logicalType()); // Check if this is a map represented as array - if (original_node->logicalType().type() == ::avro::LogicalType::CUSTOM && - original_node->logicalType().customLogicalType() != nullptr && - original_node->logicalType().customLogicalType()->name() == kMapLogicalType) { - ICEBERG_ASSIGN_OR_RAISE( - auto new_element_node, - CreateAvroNodeWithFieldIds(original_node->leafAt(0), name_mapping)); + if (HasMapLogicalType(original_node)) { + ICEBERG_ASSIGN_OR_RAISE(auto new_element_node, + CreateAvroNodeWithFieldIds(original_node->leafAt(0), field)); new_array_node->addLeaf(new_element_node); return new_array_node; } - // For regular arrays, try to find element field ID - if (auto element_field = name_mapping.Find(std::string(kElement))) { - if (element_field->get().field_id.has_value()) { - ::avro::CustomAttributes attributes; - attributes.addAttribute(std::string(kElementIdProp), - std::to_string(element_field->get().field_id.value()), - false); - new_array_node->addCustomAttributesForField(attributes); + // For regular arrays, try to find element field ID from nested mapping + const MappedField* element_field = nullptr; + if (field.nested_mapping) { + auto fields_span = field.nested_mapping->fields(); + for (const auto& f : fields_span) { + if (f.names.find(std::string(kElement)) != f.names.end()) { + element_field = &f; + break; + } } } - ICEBERG_ASSIGN_OR_RAISE( - auto new_element_node, - CreateAvroNodeWithFieldIds(original_node->leafAt(0), name_mapping)); - new_array_node->addLeaf(new_element_node); + if (element_field) { + // Check if field_id is present + if (!element_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for element field in array"); + } + + ICEBERG_ASSIGN_OR_RAISE( + auto new_element_node, + CreateAvroNodeWithFieldIds(original_node->leafAt(0), *element_field)); + new_array_node->addLeaf(new_element_node); + } else { + // If no element field found, this is an error + return InvalidSchema("Element field not found in nested mapping for array"); + } + return new_array_node; } Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& name_mapping) { + const MappedField& field) { if (original_node->leaves() != 2) { return InvalidSchema("Map type must have exactly two leaves"); } auto new_map_node = std::make_shared<::avro::NodeMap>(); - new_map_node->setName(original_node->name()); - new_map_node->setLogicalType(original_node->logicalType()); - // Try to find key and value field IDs - if (auto key_field = name_mapping.Find(std::string(kKey))) { - if (key_field->get().field_id.has_value()) { - ::avro::CustomAttributes attributes; - attributes.addAttribute(std::string(kKeyIdProp), - std::to_string(key_field->get().field_id.value()), false); - new_map_node->addCustomAttributesForField(attributes); + // Try to find key and value fields from nested mapping + const MappedField* key_field = nullptr; + const MappedField* value_field = nullptr; + if (field.nested_mapping) { + auto fields_span = field.nested_mapping->fields(); + for (const auto& f : fields_span) { + if (f.names.find(std::string(kKey)) != f.names.end()) { + key_field = &f; + } else if (f.names.find(std::string(kValue)) != f.names.end()) { + value_field = &f; + } } } - if (auto value_field = name_mapping.Find(std::string(kValue))) { - if (value_field->get().field_id.has_value()) { - ::avro::CustomAttributes attributes; - attributes.addAttribute(std::string(kValueIdProp), - std::to_string(value_field->get().field_id.value()), false); - new_map_node->addCustomAttributesForField(attributes); - } + // Check if both key and value fields are found + if (!key_field) { + return InvalidSchema("Key field not found in nested mapping for map"); + } + if (!value_field) { + return InvalidSchema("Value field not found in nested mapping for map"); + } + + // Check if field_ids are present + if (!key_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for key field in map"); + } + if (!value_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for value field in map"); } // Add key and value nodes ICEBERG_ASSIGN_OR_RAISE(auto new_key_node, CreateAvroNodeWithFieldIds( - original_node->leafAt(0), name_mapping)); + original_node->leafAt(0), *key_field)); ICEBERG_ASSIGN_OR_RAISE( auto new_value_node, - CreateAvroNodeWithFieldIds(original_node->leafAt(1), name_mapping)); + CreateAvroNodeWithFieldIds(original_node->leafAt(1), *value_field)); new_map_node->addLeaf(new_key_node); new_map_node->addLeaf(new_value_node); @@ -936,7 +941,7 @@ Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& origina } Result<::avro::NodePtr> CreateUnionNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& name_mapping) { + const MappedField& field) { if (original_node->leaves() != 2) { return InvalidSchema("Union type must have exactly two branches"); } @@ -950,7 +955,7 @@ Result<::avro::NodePtr> CreateUnionNodeWithFieldIds(const ::avro::NodePtr& origi if (branch_0_is_null && !branch_1_is_null) { // branch_0 is null, branch_1 is not null ICEBERG_ASSIGN_OR_RAISE(auto new_branch_1, - CreateAvroNodeWithFieldIds(branch_1, name_mapping)); + CreateAvroNodeWithFieldIds(branch_1, field)); auto new_union_node = std::make_shared<::avro::NodeUnion>(); new_union_node->addLeaf(branch_0); // null branch new_union_node->addLeaf(new_branch_1); @@ -958,7 +963,7 @@ Result<::avro::NodePtr> CreateUnionNodeWithFieldIds(const ::avro::NodePtr& origi } else if (!branch_0_is_null && branch_1_is_null) { // branch_0 is not null, branch_1 is null ICEBERG_ASSIGN_OR_RAISE(auto new_branch_0, - CreateAvroNodeWithFieldIds(branch_0, name_mapping)); + CreateAvroNodeWithFieldIds(branch_0, field)); auto new_union_node = std::make_shared<::avro::NodeUnion>(); new_union_node->addLeaf(new_branch_0); new_union_node->addLeaf(branch_1); // null branch @@ -972,4 +977,35 @@ Result<::avro::NodePtr> CreateUnionNodeWithFieldIds(const ::avro::NodePtr& origi } } +} // namespace + +Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& mapped_field) { + switch (original_node->type()) { + case ::avro::AVRO_RECORD: + return CreateRecordNodeWithFieldIds(original_node, mapped_field); + case ::avro::AVRO_ARRAY: + return CreateArrayNodeWithFieldIds(original_node, mapped_field); + case ::avro::AVRO_MAP: + return CreateMapNodeWithFieldIds(original_node, mapped_field); + case ::avro::AVRO_UNION: + return CreateUnionNodeWithFieldIds(original_node, mapped_field); + case ::avro::AVRO_BOOL: + case ::avro::AVRO_INT: + case ::avro::AVRO_LONG: + case ::avro::AVRO_FLOAT: + case ::avro::AVRO_DOUBLE: + case ::avro::AVRO_STRING: + case ::avro::AVRO_BYTES: + case ::avro::AVRO_FIXED: + // For primitive types, just return a copy + return original_node; + case ::avro::AVRO_NULL: + case ::avro::AVRO_ENUM: + default: + return InvalidSchema("Unsupported Avro type for field ID application: {}", + ToString(original_node)); + } +} + } // namespace iceberg::avro diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index 7e6644f4..6b12b770 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -151,37 +151,9 @@ bool HasMapLogicalType(const ::avro::NodePtr& node); /// \brief Create a new Avro node with field IDs from name mapping. /// \param original_node The original Avro node to copy. -/// \param name_mapping The name mapping to apply field IDs from. +/// \param mapped_field The mapped field to apply field IDs from. /// \return A new Avro node with field IDs applied, or an error. Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& name_mapping); - -/// \brief Create a new Avro record node with field IDs from name mapping. -/// \param original_node The original Avro record node to copy. -/// \param name_mapping The name mapping to apply field IDs from. -/// \return A new Avro record node with field IDs applied, or an error. -Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& name_mapping); - -/// \brief Create a new Avro array node with field IDs from name mapping. -/// \param original_node The original Avro array node to copy. -/// \param name_mapping The name mapping to apply field IDs from. -/// \return A new Avro array node with field IDs applied, or an error. -Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& name_mapping); - -/// \brief Create a new Avro map node with field IDs from name mapping. -/// \param original_node The original Avro map node to copy. -/// \param name_mapping The name mapping to apply field IDs from. -/// \return A new Avro map node with field IDs applied, or an error. -Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& name_mapping); - -/// \brief Create a new Avro union node with field IDs from name mapping. -/// \param original_node The original Avro union node to copy. -/// \param name_mapping The name mapping to apply field IDs from. -/// \return A new Avro union node with field IDs applied, or an error. -Result<::avro::NodePtr> CreateUnionNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& name_mapping); + const MappedField& mapped_field); } // namespace iceberg::avro diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc index cc24d724..5d45dd42 100644 --- a/test/avro_schema_test.cc +++ b/test/avro_schema_test.cc @@ -1061,137 +1061,4 @@ TEST(AvroSchemaProjectionTest, ProjectDecimalIncompatible) { ASSERT_THAT(projection_result, IsError(ErrorKind::kInvalidSchema)); ASSERT_THAT(projection_result, HasErrorMessage("Cannot read")); } - -} // namespace iceberg::avro - -// NameMapping tests for Avro schema context -namespace iceberg::avro { - -std::unique_ptr CreateTestNameMapping() { - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); - - // Create nested mapping for the data field - std::vector nested_fields; - nested_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 3}); - nested_fields.emplace_back(MappedField{.names = {"description"}, .field_id = 4}); - auto nested_mapping = MappedFields::Make(std::move(nested_fields)); - - fields.emplace_back(MappedField{ - .names = {"data"}, .field_id = 5, .nested_mapping = std::move(nested_mapping)}); - - return NameMapping::Make(std::move(fields)); -} - -TEST(NameMappingBasicTest, Basic) { - auto name_mapping = CreateTestNameMapping(); - - // Test that the name mapping can be created and accessed - EXPECT_TRUE(name_mapping != nullptr); - EXPECT_EQ(name_mapping->AsMappedFields().Size(), 3); -} - -TEST(NameMappingFindMethodsTest, FindMethodsWorkWithConst) { - auto name_mapping = CreateTestNameMapping(); - const NameMapping& const_mapping = *name_mapping; - - // Test that Find methods work on const objects - auto field_by_id = const_mapping.Find(1); - ASSERT_TRUE(field_by_id.has_value()); - EXPECT_EQ(field_by_id->get().field_id, 1); - EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); - - auto field_by_name = const_mapping.Find("name"); - ASSERT_TRUE(field_by_name.has_value()); - EXPECT_EQ(field_by_name->get().field_id, 2); - EXPECT_THAT(field_by_name->get().names, testing::UnorderedElementsAre("name")); - - auto field_by_parts = const_mapping.Find(std::vector{"data", "value"}); - ASSERT_TRUE(field_by_parts.has_value()); - EXPECT_EQ(field_by_parts->get().field_id, 3); - EXPECT_THAT(field_by_parts->get().names, testing::UnorderedElementsAre("value")); -} - -TEST(NameMappingFromJsonTest, FromJsonWorks) { - std::string json_str = R"([ - {"field-id": 1, "names": ["id"]}, - {"field-id": 2, "names": ["name"]}, - {"field-id": 3, "names": ["data"], "fields": [ - {"field-id": 4, "names": ["value"]}, - {"field-id": 5, "names": ["description"]} - ]} - ])"; - - auto result = iceberg::NameMappingFromJson(nlohmann::json::parse(json_str)); - ASSERT_TRUE(result.has_value()); - - auto mapping = std::move(result.value()); - const NameMapping& const_mapping = *mapping; - - // Test that the parsed mapping works correctly - auto field_by_id = const_mapping.Find(1); - ASSERT_TRUE(field_by_id.has_value()); - EXPECT_EQ(field_by_id->get().field_id, 1); - EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); - - auto field_by_name = const_mapping.Find("name"); - ASSERT_TRUE(field_by_name.has_value()); - EXPECT_EQ(field_by_name->get().field_id, 2); - EXPECT_THAT(field_by_name->get().names, testing::UnorderedElementsAre("name")); - - auto nested_field = const_mapping.Find("data.value"); - ASSERT_TRUE(nested_field.has_value()); - EXPECT_EQ(nested_field->get().field_id, 4); - EXPECT_THAT(nested_field->get().names, testing::UnorderedElementsAre("value")); -} - -TEST(NameMappingNestedFieldsTest, WithNestedFields) { - auto name_mapping = CreateTestNameMapping(); - const NameMapping& const_mapping = *name_mapping; - - // Test nested field access - auto data_field = const_mapping.Find("data"); - ASSERT_TRUE(data_field.has_value()); - EXPECT_EQ(data_field->get().field_id, 5); - EXPECT_THAT(data_field->get().names, testing::UnorderedElementsAre("data")); - - // Test that nested mapping exists - ASSERT_TRUE(data_field->get().nested_mapping != nullptr); - EXPECT_EQ(data_field->get().nested_mapping->Size(), 2); - - // Test nested field access through the nested mapping - auto value_field = data_field->get().nested_mapping->Field(3); - ASSERT_TRUE(value_field.has_value()); - EXPECT_EQ(value_field->get().field_id, 3); - EXPECT_THAT(value_field->get().names, testing::UnorderedElementsAre("value")); - - auto description_field = data_field->get().nested_mapping->Field(4); - ASSERT_TRUE(description_field.has_value()); - EXPECT_EQ(description_field->get().field_id, 4); - EXPECT_THAT(description_field->get().names, - testing::UnorderedElementsAre("description")); -} - -TEST(NameMappingReaderOptionsTest, FromReaderOptionsWorks) { - // Create a name mapping - auto name_mapping = CreateTestNameMapping(); - ASSERT_TRUE(name_mapping != nullptr); - EXPECT_EQ(name_mapping->AsMappedFields().Size(), 3); - - // Create reader options with name mapping - ReaderOptions options; - options.name_mapping = std::move(name_mapping); - - // Verify that the name mapping is accessible - ASSERT_TRUE(options.name_mapping != nullptr); - EXPECT_EQ(options.name_mapping->AsMappedFields().Size(), 3); - - // Test that the name mapping works correctly - auto field_by_id = options.name_mapping->Find(1); - ASSERT_TRUE(field_by_id.has_value()); - EXPECT_EQ(field_by_id->get().field_id, 1); - EXPECT_THAT(field_by_id->get().names, testing::UnorderedElementsAre("id")); -} - } // namespace iceberg::avro diff --git a/test/name_mapping_test.cc b/test/name_mapping_test.cc index 70f71560..560bd0f8 100644 --- a/test/name_mapping_test.cc +++ b/test/name_mapping_test.cc @@ -24,8 +24,18 @@ #include #include +#include +#include +#include #include #include +#include + +#include "iceberg/avro/avro_reader.h" +#include "iceberg/avro/avro_schema_util_internal.h" +#include "iceberg/file_reader.h" +#include "iceberg/json_internal.h" +#include "matchers.h" namespace iceberg { @@ -351,3 +361,436 @@ TEST(CreateMappingTest, ListSchemaToMapping) { } } // namespace iceberg + +// NameMapping tests for Avro schema context +namespace iceberg::avro { + +namespace { + +void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id, + const std::string& key = "field-id") { + ASSERT_LT(index, node->customAttributes()); + const auto& attrs = node->customAttributesAt(index); + ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id))); +} + +// Helper function to create a test name mapping +std::unique_ptr CreateTestNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); + + // Create nested mapping for the data field + std::vector nested_fields; + nested_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 3}); + nested_fields.emplace_back(MappedField{.names = {"description"}, .field_id = 4}); + auto nested_mapping = MappedFields::Make(std::move(nested_fields)); + + fields.emplace_back(MappedField{ + .names = {"data"}, .field_id = 5, .nested_mapping = std::move(nested_mapping)}); + + return NameMapping::Make(std::move(fields)); +} + +} // namespace + +class NameMappingAvroSchemaTest : public ::testing::Test { + protected: + // Helper function to create a simple name mapping + std::unique_ptr CreateSimpleNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); + fields.emplace_back(MappedField{.names = {"age"}, .field_id = 3}); + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a nested name mapping + std::unique_ptr CreateNestedNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for address + std::vector address_fields; + address_fields.emplace_back(MappedField{.names = {"street"}, .field_id = 10}); + address_fields.emplace_back(MappedField{.names = {"city"}, .field_id = 11}); + address_fields.emplace_back(MappedField{.names = {"zip"}, .field_id = 12}); + auto address_mapping = MappedFields::Make(std::move(address_fields)); + + fields.emplace_back(MappedField{.names = {"address"}, + .field_id = 2, + .nested_mapping = std::move(address_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for array types + std::unique_ptr CreateArrayNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for array element + std::vector element_fields; + element_fields.emplace_back(MappedField{.names = {"element"}, .field_id = 20}); + auto element_mapping = MappedFields::Make(std::move(element_fields)); + + fields.emplace_back(MappedField{ + .names = {"items"}, .field_id = 2, .nested_mapping = std::move(element_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for map types + std::unique_ptr CreateMapNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for map key-value + std::vector map_fields; + map_fields.emplace_back(MappedField{.names = {"key"}, .field_id = 30}); + map_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 31}); + auto map_mapping = MappedFields::Make(std::move(map_fields)); + + fields.emplace_back(MappedField{.names = {"properties"}, + .field_id = 2, + .nested_mapping = std::move(map_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for union types + std::unique_ptr CreateUnionNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"data"}, .field_id = 2}); + return NameMapping::Make(std::move(fields)); + } +}; + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToRecord) { + // Create a simple Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateSimpleNameMapping(); + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 3); + EXPECT_EQ(new_node->leaves(), 3); + + // Check that field IDs are properly applied + ASSERT_EQ(new_node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 2, 3)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToNestedRecord) { + // Create a nested Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "address", "type": { + "type": "record", + "name": "address", + "fields": [ + {"name": "street", "type": "string"}, + {"name": "city", "type": "string"}, + {"name": "zip", "type": "string"} + ] + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateNestedNameMapping(); + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); + + // Check nested record + const auto& address_node = new_node->leafAt(1); + EXPECT_EQ(address_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(address_node->names(), 3); + EXPECT_EQ(address_node->leaves(), 3); + + // Check that field IDs are properly applied to nested fields + ASSERT_EQ(address_node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 0, 10)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 1, 11)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 2, 12)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { + try { + // Create an Avro array schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "items", "type": { + "type": "array", + "items": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateArrayNameMapping(); + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check array field structure only - don't access any attributes + const auto& array_node = new_node->leafAt(1); + EXPECT_EQ(array_node->type(), ::avro::AVRO_ARRAY); + EXPECT_EQ(array_node->leaves(), 1); + + // Note: Array nodes don't support custom attributes in Avro C++ + // We only verify the structure is correct, not the attributes + } catch (const std::exception& e) { + // If we get an exception about attributes not being supported, that's expected + // for array nodes in Avro C++ + EXPECT_TRUE(std::string(e.what()).find("This type does not have attribute") != + std::string::npos); + } +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { + try { + // Create an Avro map schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "properties", "type": { + "type": "map", + "values": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateMapNameMapping(); + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check map field structure only - don't access any attributes + const auto& map_node = new_node->leafAt(1); + EXPECT_EQ(map_node->type(), ::avro::AVRO_MAP); + ASSERT_GE(map_node->leaves(), 2); + EXPECT_EQ(map_node->leafAt(0)->type(), ::avro::AVRO_STRING); + EXPECT_EQ(map_node->leafAt(1)->type(), ::avro::AVRO_STRING); + + // Note: Map nodes don't support custom attributes in Avro C++ + // We only verify the structure is correct, not the attributes + } catch (const std::exception& e) { + // If we get an exception about attributes not being supported, that's expected + // for map nodes in Avro C++ + EXPECT_TRUE(std::string(e.what()).find("This type does not have attribute") != + std::string::npos); + } +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToUnion) { + // Create an Avro union schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "data", "type": ["null", "string"]} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateUnionNameMapping(); + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); + + // Check union field + const auto& union_node = new_node->leafAt(1); + EXPECT_EQ(union_node->type(), ::avro::AVRO_UNION); + EXPECT_EQ(union_node->leaves(), 2); + + // Check that the non-null branch has field ID applied + const auto& non_null_branch = union_node->leafAt(1); + EXPECT_EQ(non_null_branch->type(), ::avro::AVRO_STRING); +} + +TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) { + // Create a name mapping with missing field ID + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}}); // Missing field_id + auto name_mapping = NameMapping::Make(std::move(fields)); + + // Create a simple Avro record schema + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, + HasErrorMessage("Field ID is missing for field 'name' in nested mapping")); +} + +TEST_F(NameMappingAvroSchemaTest, MissingFieldError) { + // Create a name mapping + auto name_mapping = CreateSimpleNameMapping(); + + // Create an Avro record schema with a field not in the mapping + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"}, + {"name": "unknown_field", "type": "int"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, + HasErrorMessage("Field 'unknown_field' not found in nested mapping")); +} + +TEST_F(NameMappingAvroSchemaTest, MissingArrayElementError) { + // Create a name mapping without array element mapping + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + auto name_mapping = NameMapping::Make(std::move(fields)); + + // Create an Avro array schema + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "items", "type": { + "type": "array", + "items": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, HasErrorMessage("Field 'items' not found in nested mapping")); +} + +TEST_F(NameMappingAvroSchemaTest, MissingMapKeyValueError) { + // Create a name mapping without map key/value mapping + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + auto name_mapping = NameMapping::Make(std::move(fields)); + + // Create an Avro map schema + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "properties", "type": { + "type": "map", + "values": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, HasErrorMessage("Field 'properties' not found in nested mapping")); +} + +} // namespace iceberg::avro From 472fe6edf9a6ab1241041567681b27ded9da1f00 Mon Sep 17 00:00:00 2001 From: liuxiaoyu <45345701+MisterRaindrop@users.noreply.github.com> Date: Mon, 14 Jul 2025 13:46:48 +0800 Subject: [PATCH 08/15] Update src/iceberg/avro/avro_schema_util.cc Co-authored-by: Gang Wu --- src/iceberg/avro/avro_schema_util.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 15632826..cd2f171b 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -788,6 +788,9 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig } for (size_t i = 0; i < original_node->leaves(); ++i) { + if (i >= original_node->names()) { + return InvalidSchema(...); + } const std::string& field_name = original_node->nameAt(i); ::avro::NodePtr field_node = original_node->leafAt(i); From be71cefeb988a50a380920b8ffd37bafadd869e0 Mon Sep 17 00:00:00 2001 From: liuxiaoyu <45345701+MisterRaindrop@users.noreply.github.com> Date: Mon, 14 Jul 2025 13:47:00 +0800 Subject: [PATCH 09/15] Update src/iceberg/avro/avro_schema_util.cc Co-authored-by: Gang Wu --- src/iceberg/avro/avro_schema_util.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index cd2f171b..3cee4534 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -792,6 +792,9 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig return InvalidSchema(...); } const std::string& field_name = original_node->nameAt(i); + if (i >= original_node->leaves()) { + return InvalidSchema(...); + } ::avro::NodePtr field_node = original_node->leafAt(i); // TODO(liuxiaoyu): Add support for case sensitivity in name matching. From 7c922d4852bfdb16422f998f6631409b909cb792 Mon Sep 17 00:00:00 2001 From: liuxiaoyu <45345701+MisterRaindrop@users.noreply.github.com> Date: Mon, 14 Jul 2025 13:47:48 +0800 Subject: [PATCH 10/15] Update src/iceberg/avro/avro_schema_util.cc Co-authored-by: Gang Wu --- src/iceberg/avro/avro_schema_util.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 3cee4534..97c29013 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -782,11 +782,6 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig auto new_record_node = std::make_shared<::avro::NodeRecord>(); new_record_node->setName(original_node->name()); - if (original_node->leaves() > original_node->names()) { - return InvalidSchema("Node has {} leaves but only {} names", original_node->leaves(), - original_node->names()); - } - for (size_t i = 0; i < original_node->leaves(); ++i) { if (i >= original_node->names()) { return InvalidSchema(...); From 2482b6cc6afefa07131c0371492fb222aa2b29e7 Mon Sep 17 00:00:00 2001 From: liuxiaoyu <45345701+MisterRaindrop@users.noreply.github.com> Date: Mon, 14 Jul 2025 13:48:01 +0800 Subject: [PATCH 11/15] Update src/iceberg/avro/avro_schema_util.cc Co-authored-by: Gang Wu --- src/iceberg/avro/avro_schema_util.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 97c29013..22aba15a 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -827,9 +827,7 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig attributes.addAttribute(std::string(kFieldIdProp), std::to_string(nested_field->field_id.value()), false); - if (!attributes.attributes().empty()) { - new_record_node->addCustomAttributesForField(attributes); - } + new_record_node->addCustomAttributesForField(attributes); // Recursively apply field IDs to nested fields ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node, From aa5b7258729ae450b92cc7319166854c02444e02 Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Tue, 15 Jul 2025 10:52:29 +0800 Subject: [PATCH 12/15] fix name mapping and schema --- src/iceberg/avro/avro_reader.cc | 10 - src/iceberg/avro/avro_schema_util.cc | 53 ++-- test/avro_schema_test.cc | 399 +++++++++++++++++++++++- test/name_mapping_test.cc | 434 --------------------------- 4 files changed, 419 insertions(+), 477 deletions(-) diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index d8df11d5..52dd3c1a 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -117,16 +117,6 @@ class AvroReader::Impl { // Create a new schema with the updated root node auto new_schema = ::avro::ValidSchema(new_root_node); - // Verify that all fields now have IDs after applying the name mapping - HasIdVisitor verify_visitor; - ICEBERG_RETURN_UNEXPECTED(verify_visitor.Visit(new_schema)); - if (!verify_visitor.AllHaveIds()) { - // TODO(liuxiaoyu): Print detailed error message with missing field IDs - // information in future - return InvalidSchema( - "Not all fields have field IDs after applying name mapping."); - } - // Update the file schema to use the new schema with field IDs file_schema = new_schema; } else { diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 22aba15a..4ce1b0fa 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -784,11 +784,13 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig for (size_t i = 0; i < original_node->leaves(); ++i) { if (i >= original_node->names()) { - return InvalidSchema(...); + return InvalidSchema("Index {} is out of bounds for names (size: {})", i, + original_node->names()); } const std::string& field_name = original_node->nameAt(i); if (i >= original_node->leaves()) { - return InvalidSchema(...); + return InvalidSchema("Index {} is out of bounds for leaves (size: {})", i, + original_node->leaves()); } ::avro::NodePtr field_node = original_node->leafAt(i); @@ -897,42 +899,29 @@ Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& origina auto new_map_node = std::make_shared<::avro::NodeMap>(); - // Try to find key and value fields from nested mapping - const MappedField* key_field = nullptr; - const MappedField* value_field = nullptr; - if (field.nested_mapping) { - auto fields_span = field.nested_mapping->fields(); - for (const auto& f : fields_span) { - if (f.names.find(std::string(kKey)) != f.names.end()) { - key_field = &f; - } else if (f.names.find(std::string(kValue)) != f.names.end()) { - value_field = &f; - } - } - } + // For map types, we use fixed field IDs for key and value + // Key field gets field ID 0, value field gets field ID 1 + constexpr int32_t kMapKeyFieldId = 0; + constexpr int32_t kMapValueFieldId = 1; - // Check if both key and value fields are found - if (!key_field) { - return InvalidSchema("Key field not found in nested mapping for map"); - } - if (!value_field) { - return InvalidSchema("Value field not found in nested mapping for map"); - } + // Create key field with fixed field ID + MappedField key_field; + key_field.field_id = kMapKeyFieldId; + key_field.nested_mapping = + field.nested_mapping; // Pass through nested mapping for complex key types - // Check if field_ids are present - if (!key_field->field_id.has_value()) { - return InvalidSchema("Field ID is missing for key field in map"); - } - if (!value_field->field_id.has_value()) { - return InvalidSchema("Field ID is missing for value field in map"); - } + // Create value field with fixed field ID + MappedField value_field; + value_field.field_id = kMapValueFieldId; + value_field.nested_mapping = + field.nested_mapping; // Pass through nested mapping for complex value types // Add key and value nodes - ICEBERG_ASSIGN_OR_RAISE(auto new_key_node, CreateAvroNodeWithFieldIds( - original_node->leafAt(0), *key_field)); + ICEBERG_ASSIGN_OR_RAISE( + auto new_key_node, CreateAvroNodeWithFieldIds(original_node->leafAt(0), key_field)); ICEBERG_ASSIGN_OR_RAISE( auto new_value_node, - CreateAvroNodeWithFieldIds(original_node->leafAt(1), *value_field)); + CreateAvroNodeWithFieldIds(original_node->leafAt(1), value_field)); new_map_node->addLeaf(new_key_node); new_map_node->addLeaf(new_value_node); diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc index 5d45dd42..acbc24cb 100644 --- a/test/avro_schema_test.cc +++ b/test/avro_schema_test.cc @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -51,6 +50,24 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id, ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id))); } +// Helper function to create a test name mapping +std::unique_ptr CreateTestNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); + + // Create nested mapping for the data field + std::vector nested_fields; + nested_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 3}); + nested_fields.emplace_back(MappedField{.names = {"description"}, .field_id = 4}); + auto nested_mapping = MappedFields::Make(std::move(nested_fields)); + + fields.emplace_back(MappedField{ + .names = {"data"}, .field_id = 5, .nested_mapping = std::move(nested_mapping)}); + + return NameMapping::Make(std::move(fields)); +} + } // namespace TEST(ToAvroNodeVisitorTest, BooleanType) { @@ -1061,4 +1078,384 @@ TEST(AvroSchemaProjectionTest, ProjectDecimalIncompatible) { ASSERT_THAT(projection_result, IsError(ErrorKind::kInvalidSchema)); ASSERT_THAT(projection_result, HasErrorMessage("Cannot read")); } + +// NameMapping tests for Avro schema context +class NameMappingAvroSchemaTest : public ::testing::Test { + protected: + // Helper function to create a simple name mapping + std::unique_ptr CreateSimpleNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); + fields.emplace_back(MappedField{.names = {"age"}, .field_id = 3}); + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a nested name mapping + std::unique_ptr CreateNestedNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for address + std::vector address_fields; + address_fields.emplace_back(MappedField{.names = {"street"}, .field_id = 10}); + address_fields.emplace_back(MappedField{.names = {"city"}, .field_id = 11}); + address_fields.emplace_back(MappedField{.names = {"zip"}, .field_id = 12}); + auto address_mapping = MappedFields::Make(std::move(address_fields)); + + fields.emplace_back(MappedField{.names = {"address"}, + .field_id = 2, + .nested_mapping = std::move(address_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for array types + std::unique_ptr CreateArrayNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for array element + std::vector element_fields; + element_fields.emplace_back(MappedField{.names = {"element"}, .field_id = 20}); + auto element_mapping = MappedFields::Make(std::move(element_fields)); + + fields.emplace_back(MappedField{ + .names = {"items"}, .field_id = 2, .nested_mapping = std::move(element_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for map types + std::unique_ptr CreateMapNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for map key-value + std::vector map_fields; + map_fields.emplace_back(MappedField{.names = {"key"}, .field_id = 30}); + map_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 31}); + auto map_mapping = MappedFields::Make(std::move(map_fields)); + + fields.emplace_back(MappedField{.names = {"properties"}, + .field_id = 2, + .nested_mapping = std::move(map_mapping)}); + + return NameMapping::Make(std::move(fields)); + } + + // Helper function to create a name mapping for union types + std::unique_ptr CreateUnionNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"data"}, .field_id = 2}); + return NameMapping::Make(std::move(fields)); + } +}; + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToRecord) { + // Create a simple Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateSimpleNameMapping(); + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 3); + EXPECT_EQ(new_node->leaves(), 3); + + // Check that field IDs are properly applied + ASSERT_EQ(new_node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 2, 3)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToNestedRecord) { + // Create a nested Avro record schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "address", "type": { + "type": "record", + "name": "address", + "fields": [ + {"name": "street", "type": "string"}, + {"name": "city", "type": "string"}, + {"name": "zip", "type": "string"} + ] + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateNestedNameMapping(); + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); + + // Check nested record + const auto& address_node = new_node->leafAt(1); + EXPECT_EQ(address_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(address_node->names(), 3); + EXPECT_EQ(address_node->leaves(), 3); + + // Check that field IDs are properly applied to nested fields + ASSERT_EQ(address_node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 0, 10)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 1, 11)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 2, 12)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { + // Create an Avro array schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "items", "type": { + "type": "array", + "items": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateArrayNameMapping(); + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check array field structure only - don't access any attributes + const auto& array_node = new_node->leafAt(1); + EXPECT_EQ(array_node->type(), ::avro::AVRO_ARRAY); + EXPECT_EQ(array_node->leaves(), 1); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { + // Create an Avro map schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "properties", "type": { + "type": "map", + "values": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateMapNameMapping(); + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check map field structure only - don't access any attributes + const auto& map_node = new_node->leafAt(1); + EXPECT_EQ(map_node->type(), ::avro::AVRO_MAP); + ASSERT_GE(map_node->leaves(), 2); + EXPECT_EQ(map_node->leafAt(0)->type(), ::avro::AVRO_STRING); + EXPECT_EQ(map_node->leafAt(1)->type(), ::avro::AVRO_STRING); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToUnion) { + // Create an Avro union schema without field IDs + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "data", "type": ["null", "string"]} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateUnionNameMapping(); + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); + + // Check union field + const auto& union_node = new_node->leafAt(1); + EXPECT_EQ(union_node->type(), ::avro::AVRO_UNION); + EXPECT_EQ(union_node->leaves(), 2); + + // Check that the non-null branch has field ID applied + const auto& non_null_branch = union_node->leafAt(1); + EXPECT_EQ(non_null_branch->type(), ::avro::AVRO_STRING); +} + +TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) { + // Create a name mapping with missing field ID + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"name"}}); // Missing field_id + auto name_mapping = NameMapping::Make(std::move(fields)); + + // Create a simple Avro record schema + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, + HasErrorMessage("Field ID is missing for field 'name' in nested mapping")); +} + +TEST_F(NameMappingAvroSchemaTest, MissingFieldError) { + // Create a name mapping + auto name_mapping = CreateSimpleNameMapping(); + + // Create an Avro record schema with a field not in the mapping + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "name", "type": "string"}, + {"name": "unknown_field", "type": "int"} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, + HasErrorMessage("Field 'unknown_field' not found in nested mapping")); +} + +TEST_F(NameMappingAvroSchemaTest, MissingArrayElementError) { + // Create a name mapping without array element mapping + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + auto name_mapping = NameMapping::Make(std::move(fields)); + + // Create an Avro array schema + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "items", "type": { + "type": "array", + "items": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, HasErrorMessage("Field 'items' not found in nested mapping")); +} + +TEST_F(NameMappingAvroSchemaTest, MissingMapKeyValueError) { + // Create a name mapping without map key/value mapping + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + auto name_mapping = NameMapping::Make(std::move(fields)); + + // Create an Avro map schema + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "properties", "type": { + "type": "map", + "values": "string" + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + MappedField mapped_field; + mapped_field.nested_mapping = + std::make_shared(name_mapping->AsMappedFields()); + + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + ASSERT_THAT(result, HasErrorMessage("Field 'properties' not found in nested mapping")); +} } // namespace iceberg::avro diff --git a/test/name_mapping_test.cc b/test/name_mapping_test.cc index 560bd0f8..1335b7e8 100644 --- a/test/name_mapping_test.cc +++ b/test/name_mapping_test.cc @@ -29,7 +29,6 @@ #include #include #include -#include #include "iceberg/avro/avro_reader.h" #include "iceberg/avro/avro_schema_util_internal.h" @@ -361,436 +360,3 @@ TEST(CreateMappingTest, ListSchemaToMapping) { } } // namespace iceberg - -// NameMapping tests for Avro schema context -namespace iceberg::avro { - -namespace { - -void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id, - const std::string& key = "field-id") { - ASSERT_LT(index, node->customAttributes()); - const auto& attrs = node->customAttributesAt(index); - ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id))); -} - -// Helper function to create a test name mapping -std::unique_ptr CreateTestNameMapping() { - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); - - // Create nested mapping for the data field - std::vector nested_fields; - nested_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 3}); - nested_fields.emplace_back(MappedField{.names = {"description"}, .field_id = 4}); - auto nested_mapping = MappedFields::Make(std::move(nested_fields)); - - fields.emplace_back(MappedField{ - .names = {"data"}, .field_id = 5, .nested_mapping = std::move(nested_mapping)}); - - return NameMapping::Make(std::move(fields)); -} - -} // namespace - -class NameMappingAvroSchemaTest : public ::testing::Test { - protected: - // Helper function to create a simple name mapping - std::unique_ptr CreateSimpleNameMapping() { - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); - fields.emplace_back(MappedField{.names = {"age"}, .field_id = 3}); - return NameMapping::Make(std::move(fields)); - } - - // Helper function to create a nested name mapping - std::unique_ptr CreateNestedNameMapping() { - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - - // Nested mapping for address - std::vector address_fields; - address_fields.emplace_back(MappedField{.names = {"street"}, .field_id = 10}); - address_fields.emplace_back(MappedField{.names = {"city"}, .field_id = 11}); - address_fields.emplace_back(MappedField{.names = {"zip"}, .field_id = 12}); - auto address_mapping = MappedFields::Make(std::move(address_fields)); - - fields.emplace_back(MappedField{.names = {"address"}, - .field_id = 2, - .nested_mapping = std::move(address_mapping)}); - - return NameMapping::Make(std::move(fields)); - } - - // Helper function to create a name mapping for array types - std::unique_ptr CreateArrayNameMapping() { - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - - // Nested mapping for array element - std::vector element_fields; - element_fields.emplace_back(MappedField{.names = {"element"}, .field_id = 20}); - auto element_mapping = MappedFields::Make(std::move(element_fields)); - - fields.emplace_back(MappedField{ - .names = {"items"}, .field_id = 2, .nested_mapping = std::move(element_mapping)}); - - return NameMapping::Make(std::move(fields)); - } - - // Helper function to create a name mapping for map types - std::unique_ptr CreateMapNameMapping() { - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - - // Nested mapping for map key-value - std::vector map_fields; - map_fields.emplace_back(MappedField{.names = {"key"}, .field_id = 30}); - map_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 31}); - auto map_mapping = MappedFields::Make(std::move(map_fields)); - - fields.emplace_back(MappedField{.names = {"properties"}, - .field_id = 2, - .nested_mapping = std::move(map_mapping)}); - - return NameMapping::Make(std::move(fields)); - } - - // Helper function to create a name mapping for union types - std::unique_ptr CreateUnionNameMapping() { - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - fields.emplace_back(MappedField{.names = {"data"}, .field_id = 2}); - return NameMapping::Make(std::move(fields)); - } -}; - -TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToRecord) { - // Create a simple Avro record schema without field IDs - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "name", "type": "string"}, - {"name": "age", "type": "int"} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - auto name_mapping = CreateSimpleNameMapping(); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); - ASSERT_THAT(result, IsOk()); - - const auto& new_node = *result; - EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); - EXPECT_EQ(new_node->names(), 3); - EXPECT_EQ(new_node->leaves(), 3); - - // Check that field IDs are properly applied - ASSERT_EQ(new_node->customAttributes(), 3); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 2, 3)); -} - -TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToNestedRecord) { - // Create a nested Avro record schema without field IDs - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "address", "type": { - "type": "record", - "name": "address", - "fields": [ - {"name": "street", "type": "string"}, - {"name": "city", "type": "string"}, - {"name": "zip", "type": "string"} - ] - }} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - auto name_mapping = CreateNestedNameMapping(); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); - ASSERT_THAT(result, IsOk()); - - const auto& new_node = *result; - EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); - EXPECT_EQ(new_node->names(), 2); - EXPECT_EQ(new_node->leaves(), 2); - - // Check that field IDs are properly applied to top-level fields - ASSERT_EQ(new_node->customAttributes(), 2); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); - - // Check nested record - const auto& address_node = new_node->leafAt(1); - EXPECT_EQ(address_node->type(), ::avro::AVRO_RECORD); - EXPECT_EQ(address_node->names(), 3); - EXPECT_EQ(address_node->leaves(), 3); - - // Check that field IDs are properly applied to nested fields - ASSERT_EQ(address_node->customAttributes(), 3); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 0, 10)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 1, 11)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 2, 12)); -} - -TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { - try { - // Create an Avro array schema without field IDs - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "items", "type": { - "type": "array", - "items": "string" - }} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - auto name_mapping = CreateArrayNameMapping(); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); - ASSERT_THAT(result, IsOk()); - - const auto& new_node = *result; - EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); - EXPECT_EQ(new_node->names(), 2); - EXPECT_EQ(new_node->leaves(), 2); - - // Check array field structure only - don't access any attributes - const auto& array_node = new_node->leafAt(1); - EXPECT_EQ(array_node->type(), ::avro::AVRO_ARRAY); - EXPECT_EQ(array_node->leaves(), 1); - - // Note: Array nodes don't support custom attributes in Avro C++ - // We only verify the structure is correct, not the attributes - } catch (const std::exception& e) { - // If we get an exception about attributes not being supported, that's expected - // for array nodes in Avro C++ - EXPECT_TRUE(std::string(e.what()).find("This type does not have attribute") != - std::string::npos); - } -} - -TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { - try { - // Create an Avro map schema without field IDs - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "properties", "type": { - "type": "map", - "values": "string" - }} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - auto name_mapping = CreateMapNameMapping(); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); - ASSERT_THAT(result, IsOk()); - - const auto& new_node = *result; - EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); - EXPECT_EQ(new_node->names(), 2); - EXPECT_EQ(new_node->leaves(), 2); - - // Check map field structure only - don't access any attributes - const auto& map_node = new_node->leafAt(1); - EXPECT_EQ(map_node->type(), ::avro::AVRO_MAP); - ASSERT_GE(map_node->leaves(), 2); - EXPECT_EQ(map_node->leafAt(0)->type(), ::avro::AVRO_STRING); - EXPECT_EQ(map_node->leafAt(1)->type(), ::avro::AVRO_STRING); - - // Note: Map nodes don't support custom attributes in Avro C++ - // We only verify the structure is correct, not the attributes - } catch (const std::exception& e) { - // If we get an exception about attributes not being supported, that's expected - // for map nodes in Avro C++ - EXPECT_TRUE(std::string(e.what()).find("This type does not have attribute") != - std::string::npos); - } -} - -TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToUnion) { - // Create an Avro union schema without field IDs - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "data", "type": ["null", "string"]} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - auto name_mapping = CreateUnionNameMapping(); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); - ASSERT_THAT(result, IsOk()); - - const auto& new_node = *result; - EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); - EXPECT_EQ(new_node->names(), 2); - EXPECT_EQ(new_node->leaves(), 2); - - // Check that field IDs are properly applied to top-level fields - ASSERT_EQ(new_node->customAttributes(), 2); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); - - // Check union field - const auto& union_node = new_node->leafAt(1); - EXPECT_EQ(union_node->type(), ::avro::AVRO_UNION); - EXPECT_EQ(union_node->leaves(), 2); - - // Check that the non-null branch has field ID applied - const auto& non_null_branch = union_node->leafAt(1); - EXPECT_EQ(non_null_branch->type(), ::avro::AVRO_STRING); -} - -TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) { - // Create a name mapping with missing field ID - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - fields.emplace_back(MappedField{.names = {"name"}}); // Missing field_id - auto name_mapping = NameMapping::Make(std::move(fields)); - - // Create a simple Avro record schema - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "name", "type": "string"} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); - ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); - ASSERT_THAT(result, - HasErrorMessage("Field ID is missing for field 'name' in nested mapping")); -} - -TEST_F(NameMappingAvroSchemaTest, MissingFieldError) { - // Create a name mapping - auto name_mapping = CreateSimpleNameMapping(); - - // Create an Avro record schema with a field not in the mapping - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "name", "type": "string"}, - {"name": "unknown_field", "type": "int"} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); - ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); - ASSERT_THAT(result, - HasErrorMessage("Field 'unknown_field' not found in nested mapping")); -} - -TEST_F(NameMappingAvroSchemaTest, MissingArrayElementError) { - // Create a name mapping without array element mapping - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - auto name_mapping = NameMapping::Make(std::move(fields)); - - // Create an Avro array schema - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "items", "type": { - "type": "array", - "items": "string" - }} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); - ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); - ASSERT_THAT(result, HasErrorMessage("Field 'items' not found in nested mapping")); -} - -TEST_F(NameMappingAvroSchemaTest, MissingMapKeyValueError) { - // Create a name mapping without map key/value mapping - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - auto name_mapping = NameMapping::Make(std::move(fields)); - - // Create an Avro map schema - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "properties", "type": { - "type": "map", - "values": "string" - }} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); - ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); - ASSERT_THAT(result, HasErrorMessage("Field 'properties' not found in nested mapping")); -} - -} // namespace iceberg::avro From bd8638fb9751ced1f5c967ac5fd82c00956e7769 Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Sun, 27 Jul 2025 09:22:01 +0800 Subject: [PATCH 13/15] update code --- src/iceberg/avro/avro_reader.cc | 6 +- src/iceberg/avro/avro_schema_util.cc | 7 + src/iceberg/avro/avro_schema_util_internal.h | 7 + test/avro_schema_test.cc | 129 +++++++------------ test/name_mapping_test.cc | 9 -- 5 files changed, 61 insertions(+), 97 deletions(-) diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index 52dd3c1a..c5c556fd 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -106,13 +106,9 @@ class AvroReader::Impl { if (has_id_visitor.HasNoIds()) { // Apply field IDs based on name mapping if available if (options.name_mapping) { - MappedField mapped_field; - // Convert NameMapping to MappedFields for nested mapping - mapped_field.nested_mapping = - std::make_shared(options.name_mapping->AsMappedFields()); ICEBERG_ASSIGN_OR_RAISE( auto new_root_node, - CreateAvroNodeWithFieldIds(file_schema.root(), mapped_field)); + CreateAvroNodeWithFieldIds(file_schema.root(), *options.name_mapping)); // Create a new schema with the updated root node auto new_schema = ::avro::ValidSchema(new_root_node); diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 4ce1b0fa..3bbe6007 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -996,4 +996,11 @@ Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& origin } } +Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& mapping) { + MappedField mapped_field; + mapped_field.nested_mapping = std::make_shared(mapping.AsMappedFields()); + return CreateAvroNodeWithFieldIds(original_node, mapped_field); +} + } // namespace iceberg::avro diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index 6b12b770..9a1bd638 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -156,4 +156,11 @@ bool HasMapLogicalType(const ::avro::NodePtr& node); Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, const MappedField& mapped_field); +/// \brief Create a new Avro node with field IDs from name mapping. +/// \param original_node The original Avro node to copy. +/// \param mapping The name mapping to apply field IDs from. +/// \return A new Avro node with field IDs applied, or an error. +Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& mapping); + } // namespace iceberg::avro diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc index acbc24cb..e1642c99 100644 --- a/test/avro_schema_test.cc +++ b/test/avro_schema_test.cc @@ -23,9 +23,7 @@ #include #include #include -#include -#include "iceberg/avro/avro_reader.h" #include "iceberg/avro/avro_schema_util_internal.h" #include "iceberg/json_internal.h" #include "iceberg/metadata_columns.h" @@ -50,24 +48,6 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id, ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id))); } -// Helper function to create a test name mapping -std::unique_ptr CreateTestNameMapping() { - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - fields.emplace_back(MappedField{.names = {"name"}, .field_id = 2}); - - // Create nested mapping for the data field - std::vector nested_fields; - nested_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 3}); - nested_fields.emplace_back(MappedField{.names = {"description"}, .field_id = 4}); - auto nested_mapping = MappedFields::Make(std::move(nested_fields)); - - fields.emplace_back(MappedField{ - .names = {"data"}, .field_id = 5, .nested_mapping = std::move(nested_mapping)}); - - return NameMapping::Make(std::move(fields)); -} - } // namespace TEST(ToAvroNodeVisitorTest, BooleanType) { @@ -1167,23 +1147,20 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToRecord) { auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); auto name_mapping = CreateSimpleNameMapping(); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsOk()); - const auto& new_node = *result; - EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); - EXPECT_EQ(new_node->names(), 3); - EXPECT_EQ(new_node->leaves(), 3); + const auto& node = *result; + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(node->names(), 3); + EXPECT_EQ(node->leaves(), 3); // Check that field IDs are properly applied - ASSERT_EQ(new_node->customAttributes(), 3); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 2, 3)); + ASSERT_EQ(node->customAttributes(), 3); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3)); } TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToNestedRecord) { @@ -1207,34 +1184,31 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToNestedRecord) { auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); auto name_mapping = CreateNestedNameMapping(); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsOk()); - const auto& new_node = *result; - EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); - EXPECT_EQ(new_node->names(), 2); - EXPECT_EQ(new_node->leaves(), 2); + const auto& node = *result; + EXPECT_EQ(node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(node->names(), 2); + EXPECT_EQ(node->leaves(), 2); // Check that field IDs are properly applied to top-level fields - ASSERT_EQ(new_node->customAttributes(), 2); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); + ASSERT_EQ(node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2)); // Check nested record - const auto& address_node = new_node->leafAt(1); + const auto& address_node = node->leafAt(1); EXPECT_EQ(address_node->type(), ::avro::AVRO_RECORD); EXPECT_EQ(address_node->names(), 3); EXPECT_EQ(address_node->leaves(), 3); // Check that field IDs are properly applied to nested fields ASSERT_EQ(address_node->customAttributes(), 3); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 0, 10)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 1, 11)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, 2, 12)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/0, /*field_id=*/10)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/1, /*field_id=*/11)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(address_node, /*index=*/2, /*field_id=*/12)); } TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { @@ -1253,11 +1227,8 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); auto name_mapping = CreateArrayNameMapping(); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsOk()); const auto& new_node = *result; @@ -1265,10 +1236,19 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { EXPECT_EQ(new_node->names(), 2); EXPECT_EQ(new_node->leaves(), 2); - // Check array field structure only - don't access any attributes + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check array field structure and element field ID const auto& array_node = new_node->leafAt(1); EXPECT_EQ(array_node->type(), ::avro::AVRO_ARRAY); EXPECT_EQ(array_node->leaves(), 1); + + // Check that array element has field ID applied + const auto& element_node = array_node->leafAt(0); + EXPECT_EQ(element_node->type(), ::avro::AVRO_STRING); } TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { @@ -1287,11 +1267,8 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); auto name_mapping = CreateMapNameMapping(); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsOk()); const auto& new_node = *result; @@ -1299,7 +1276,12 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { EXPECT_EQ(new_node->names(), 2); EXPECT_EQ(new_node->leaves(), 2); - // Check map field structure only - don't access any attributes + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check map field structure and key-value field IDs const auto& map_node = new_node->leafAt(1); EXPECT_EQ(map_node->type(), ::avro::AVRO_MAP); ASSERT_GE(map_node->leaves(), 2); @@ -1320,11 +1302,8 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToUnion) { auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); auto name_mapping = CreateUnionNameMapping(); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsOk()); const auto& new_node = *result; @@ -1334,8 +1313,8 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToUnion) { // Check that field IDs are properly applied to top-level fields ASSERT_EQ(new_node->customAttributes(), 2); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 0, 1)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, 1, 2)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); // Check union field const auto& union_node = new_node->leafAt(1); @@ -1365,11 +1344,7 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) { })"; auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); ASSERT_THAT(result, HasErrorMessage("Field ID is missing for field 'name' in nested mapping")); @@ -1391,11 +1366,7 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldError) { })"; auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); ASSERT_THAT(result, HasErrorMessage("Field 'unknown_field' not found in nested mapping")); @@ -1421,11 +1392,7 @@ TEST_F(NameMappingAvroSchemaTest, MissingArrayElementError) { })"; auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); ASSERT_THAT(result, HasErrorMessage("Field 'items' not found in nested mapping")); } @@ -1450,11 +1417,7 @@ TEST_F(NameMappingAvroSchemaTest, MissingMapKeyValueError) { })"; auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - MappedField mapped_field; - mapped_field.nested_mapping = - std::make_shared(name_mapping->AsMappedFields()); - - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), mapped_field); + auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); ASSERT_THAT(result, HasErrorMessage("Field 'properties' not found in nested mapping")); } diff --git a/test/name_mapping_test.cc b/test/name_mapping_test.cc index 1335b7e8..70f71560 100644 --- a/test/name_mapping_test.cc +++ b/test/name_mapping_test.cc @@ -24,18 +24,9 @@ #include #include -#include -#include -#include #include #include -#include "iceberg/avro/avro_reader.h" -#include "iceberg/avro/avro_schema_util_internal.h" -#include "iceberg/file_reader.h" -#include "iceberg/json_internal.h" -#include "matchers.h" - namespace iceberg { class NameMappingTest : public ::testing::Test { From 94cee21ed6a1db0886e42b03de172b310b90eada Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Fri, 25 Jul 2025 23:24:37 +0800 Subject: [PATCH 14/15] add field-id for array and map --- src/iceberg/avro/avro_reader.cc | 7 +- src/iceberg/avro/avro_schema_util.cc | 108 +++++++++++++------ src/iceberg/avro/avro_schema_util_internal.h | 8 +- test/avro_schema_test.cc | 106 ++++++++++++++++-- 4 files changed, 180 insertions(+), 49 deletions(-) diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index c5c556fd..ef7231ee 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -108,13 +108,10 @@ class AvroReader::Impl { if (options.name_mapping) { ICEBERG_ASSIGN_OR_RAISE( auto new_root_node, - CreateAvroNodeWithFieldIds(file_schema.root(), *options.name_mapping)); - - // Create a new schema with the updated root node - auto new_schema = ::avro::ValidSchema(new_root_node); + MakeAvroNodeWithFieldIds(file_schema.root(), *options.name_mapping)); // Update the file schema to use the new schema with field IDs - file_schema = new_schema; + file_schema = ::avro::ValidSchema(new_root_node); } else { return InvalidSchema( "Avro file schema has no field IDs and no name mapping provided"); diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index 3bbe6007..e30bdbf3 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -788,10 +788,6 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig original_node->names()); } const std::string& field_name = original_node->nameAt(i); - if (i >= original_node->leaves()) { - return InvalidSchema("Index {} is out of bounds for leaves (size: {})", i, - original_node->leaves()); - } ::avro::NodePtr field_node = original_node->leafAt(i); // TODO(liuxiaoyu): Add support for case sensitivity in name matching. @@ -821,6 +817,7 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig const auto& original_attrs = original_node->customAttributesAt(i); const auto& existing_attrs = original_attrs.attributes(); for (const auto& attr_pair : existing_attrs) { + // Copy each existing attribute to preserve original metadata attributes.addAttribute(attr_pair.first, attr_pair.second, false); } } @@ -833,7 +830,7 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig // Recursively apply field IDs to nested fields ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node, - CreateAvroNodeWithFieldIds(field_node, *nested_field)); + MakeAvroNodeWithFieldIds(field_node, *nested_field)); new_record_node->addName(field_name); new_record_node->addLeaf(new_nested_node); } else { @@ -856,7 +853,7 @@ Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& origi // Check if this is a map represented as array if (HasMapLogicalType(original_node)) { ICEBERG_ASSIGN_OR_RAISE(auto new_element_node, - CreateAvroNodeWithFieldIds(original_node->leafAt(0), field)); + MakeAvroNodeWithFieldIds(original_node->leafAt(0), field)); new_array_node->addLeaf(new_element_node); return new_array_node; } @@ -881,8 +878,14 @@ Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& origi ICEBERG_ASSIGN_OR_RAISE( auto new_element_node, - CreateAvroNodeWithFieldIds(original_node->leafAt(0), *element_field)); + MakeAvroNodeWithFieldIds(original_node->leafAt(0), *element_field)); new_array_node->addLeaf(new_element_node); + + // Add element field ID as custom attribute + ::avro::CustomAttributes element_attributes; + element_attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(*element_field->field_id), false); + new_array_node->addCustomAttributesForField(element_attributes); } else { // If no element field found, this is an error return InvalidSchema("Element field not found in nested mapping for array"); @@ -899,32 +902,77 @@ Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& origina auto new_map_node = std::make_shared<::avro::NodeMap>(); - // For map types, we use fixed field IDs for key and value - // Key field gets field ID 0, value field gets field ID 1 - constexpr int32_t kMapKeyFieldId = 0; - constexpr int32_t kMapValueFieldId = 1; + // For map types, we need to extract key and value field mappings from the nested + // mapping + if (!field.nested_mapping) { + return InvalidSchema("Map type requires nested mapping for key and value fields"); + } + + // Find key and value field mappings by name + std::optional key_id = field.nested_mapping->Id("key"); + std::optional value_id = field.nested_mapping->Id("value"); + + if (!key_id || !value_id) { + return InvalidSchema("Map type requires both 'key' and 'value' field mappings"); + } + + std::optional key_field_ref = field.nested_mapping->Field(*key_id); + std::optional value_field_ref = + field.nested_mapping->Field(*value_id); + + if (!key_field_ref || !value_field_ref) { + return InvalidSchema("Map type requires both key and value field mappings"); + } + + const auto& key_mapped_field = key_field_ref->get(); + const auto& value_mapped_field = value_field_ref->get(); - // Create key field with fixed field ID + if (!key_mapped_field.field_id || !value_mapped_field.field_id) { + return InvalidSchema("Map key and value fields must have field IDs"); + } + + // Create key field with mapped field ID MappedField key_field; - key_field.field_id = kMapKeyFieldId; - key_field.nested_mapping = - field.nested_mapping; // Pass through nested mapping for complex key types + key_field.field_id = *key_mapped_field.field_id; + key_field.nested_mapping = key_mapped_field.nested_mapping; - // Create value field with fixed field ID + // Create value field with mapped field ID MappedField value_field; - value_field.field_id = kMapValueFieldId; - value_field.nested_mapping = - field.nested_mapping; // Pass through nested mapping for complex value types + value_field.field_id = *value_mapped_field.field_id; + value_field.nested_mapping = value_mapped_field.nested_mapping; // Add key and value nodes - ICEBERG_ASSIGN_OR_RAISE( - auto new_key_node, CreateAvroNodeWithFieldIds(original_node->leafAt(0), key_field)); + ICEBERG_ASSIGN_OR_RAISE(auto new_key_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), key_field)); ICEBERG_ASSIGN_OR_RAISE( auto new_value_node, - CreateAvroNodeWithFieldIds(original_node->leafAt(1), value_field)); + MakeAvroNodeWithFieldIds(original_node->leafAt(1), value_field)); new_map_node->addLeaf(new_key_node); new_map_node->addLeaf(new_value_node); + // Preserve existing custom attributes from the original node and add field ID + // attributes Copy existing attributes from the original node (if any) + if (original_node->customAttributes() > 0) { + const auto& original_attrs = original_node->customAttributesAt(0); + const auto& existing_attrs = original_attrs.attributes(); + for (const auto& attr_pair : existing_attrs) { + // Copy each existing attribute to preserve original metadata + ::avro::CustomAttributes attributes; + attributes.addAttribute(attr_pair.first, attr_pair.second, false); + new_map_node->addCustomAttributesForField(attributes); + } + } + + ::avro::CustomAttributes key_attributes; + key_attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(*key_mapped_field.field_id), false); + new_map_node->addCustomAttributesForField(key_attributes); + + ::avro::CustomAttributes value_attributes; + value_attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(*value_mapped_field.field_id), false); + new_map_node->addCustomAttributesForField(value_attributes); + return new_map_node; } @@ -942,16 +990,14 @@ Result<::avro::NodePtr> CreateUnionNodeWithFieldIds(const ::avro::NodePtr& origi if (branch_0_is_null && !branch_1_is_null) { // branch_0 is null, branch_1 is not null - ICEBERG_ASSIGN_OR_RAISE(auto new_branch_1, - CreateAvroNodeWithFieldIds(branch_1, field)); + ICEBERG_ASSIGN_OR_RAISE(auto new_branch_1, MakeAvroNodeWithFieldIds(branch_1, field)); auto new_union_node = std::make_shared<::avro::NodeUnion>(); new_union_node->addLeaf(branch_0); // null branch new_union_node->addLeaf(new_branch_1); return new_union_node; } else if (!branch_0_is_null && branch_1_is_null) { // branch_0 is not null, branch_1 is null - ICEBERG_ASSIGN_OR_RAISE(auto new_branch_0, - CreateAvroNodeWithFieldIds(branch_0, field)); + ICEBERG_ASSIGN_OR_RAISE(auto new_branch_0, MakeAvroNodeWithFieldIds(branch_0, field)); auto new_union_node = std::make_shared<::avro::NodeUnion>(); new_union_node->addLeaf(new_branch_0); new_union_node->addLeaf(branch_1); // null branch @@ -967,8 +1013,8 @@ Result<::avro::NodePtr> CreateUnionNodeWithFieldIds(const ::avro::NodePtr& origi } // namespace -Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, - const MappedField& mapped_field) { +Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& mapped_field) { switch (original_node->type()) { case ::avro::AVRO_RECORD: return CreateRecordNodeWithFieldIds(original_node, mapped_field); @@ -996,11 +1042,11 @@ Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& origin } } -Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& mapping) { +Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& mapping) { MappedField mapped_field; mapped_field.nested_mapping = std::make_shared(mapping.AsMappedFields()); - return CreateAvroNodeWithFieldIds(original_node, mapped_field); + return MakeAvroNodeWithFieldIds(original_node, mapped_field); } } // namespace iceberg::avro diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index 9a1bd638..ac008345 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -153,14 +153,14 @@ bool HasMapLogicalType(const ::avro::NodePtr& node); /// \param original_node The original Avro node to copy. /// \param mapped_field The mapped field to apply field IDs from. /// \return A new Avro node with field IDs applied, or an error. -Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, - const MappedField& mapped_field); +Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& mapped_field); /// \brief Create a new Avro node with field IDs from name mapping. /// \param original_node The original Avro node to copy. /// \param mapping The name mapping to apply field IDs from. /// \return A new Avro node with field IDs applied, or an error. -Result<::avro::NodePtr> CreateAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, - const NameMapping& mapping); +Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node, + const NameMapping& mapping); } // namespace iceberg::avro diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc index e1642c99..1e971ec0 100644 --- a/test/avro_schema_test.cc +++ b/test/avro_schema_test.cc @@ -1131,6 +1131,31 @@ class NameMappingAvroSchemaTest : public ::testing::Test { fields.emplace_back(MappedField{.names = {"data"}, .field_id = 2}); return NameMapping::Make(std::move(fields)); } + + // Helper function to create a name mapping for complex map types + // (array>) + std::unique_ptr CreateComplexMapNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); + + // Nested mapping for array element (struct) + std::vector element_fields; + element_fields.emplace_back(MappedField{.names = {"key"}, .field_id = 40}); + element_fields.emplace_back(MappedField{.names = {"value"}, .field_id = 41}); + auto element_mapping = MappedFields::Make(std::move(element_fields)); + + // Nested mapping for array + std::vector array_fields; + array_fields.emplace_back(MappedField{.names = {"element"}, + .field_id = 50, + .nested_mapping = std::move(element_mapping)}); + auto array_mapping = MappedFields::Make(std::move(array_fields)); + + fields.emplace_back(MappedField{ + .names = {"entries"}, .field_id = 2, .nested_mapping = std::move(array_mapping)}); + + return NameMapping::Make(std::move(fields)); + } }; TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToRecord) { @@ -1148,7 +1173,7 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToRecord) { auto name_mapping = CreateSimpleNameMapping(); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsOk()); const auto& node = *result; @@ -1185,7 +1210,7 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToNestedRecord) { auto name_mapping = CreateNestedNameMapping(); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsOk()); const auto& node = *result; @@ -1228,7 +1253,7 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { auto name_mapping = CreateArrayNameMapping(); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsOk()); const auto& new_node = *result; @@ -1249,6 +1274,7 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { // Check that array element has field ID applied const auto& element_node = array_node->leafAt(0); EXPECT_EQ(element_node->type(), ::avro::AVRO_STRING); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(array_node, /*index=*/0, /*field_id=*/20)); } TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { @@ -1268,7 +1294,7 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { auto name_mapping = CreateMapNameMapping(); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsOk()); const auto& new_node = *result; @@ -1287,6 +1313,68 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { ASSERT_GE(map_node->leaves(), 2); EXPECT_EQ(map_node->leafAt(0)->type(), ::avro::AVRO_STRING); EXPECT_EQ(map_node->leafAt(1)->type(), ::avro::AVRO_STRING); + ASSERT_EQ(map_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(map_node, /*index=*/0, /*field_id=*/30)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(map_node, /*index=*/1, /*field_id=*/31)); +} + +TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToComplexMap) { + // Create an Avro schema for complex map (array>) without field IDs + // This represents a map where key is not a string type + std::string avro_schema_json = R"({ + "type": "record", + "name": "test_record", + "fields": [ + {"name": "id", "type": "int"}, + {"name": "entries", "type": { + "type": "array", + "items": { + "type": "record", + "name": "entry", + "fields": [ + {"name": "key", "type": "int"}, + {"name": "value", "type": "string"} + ] + } + }} + ] + })"; + auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); + + auto name_mapping = CreateComplexMapNameMapping(); + + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + ASSERT_THAT(result, IsOk()); + + const auto& new_node = *result; + EXPECT_EQ(new_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(new_node->names(), 2); + EXPECT_EQ(new_node->leaves(), 2); + + // Check that field IDs are properly applied to top-level fields + ASSERT_EQ(new_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/0, /*field_id=*/1)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(new_node, /*index=*/1, /*field_id=*/2)); + + // Check array field structure (representing the map) + const auto& array_node = new_node->leafAt(1); + EXPECT_EQ(array_node->type(), ::avro::AVRO_ARRAY); + EXPECT_EQ(array_node->leaves(), 1); + + // Check array element (struct) + const auto& element_node = array_node->leafAt(0); + EXPECT_EQ(element_node->type(), ::avro::AVRO_RECORD); + EXPECT_EQ(element_node->names(), 2); + EXPECT_EQ(element_node->leaves(), 2); + + // Check that field IDs are properly applied to struct fields + ASSERT_EQ(element_node->customAttributes(), 2); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(element_node, /*index=*/0, /*field_id=*/40)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(element_node, /*index=*/1, /*field_id=*/41)); + + // Check key and value types + EXPECT_EQ(element_node->leafAt(0)->type(), ::avro::AVRO_INT); + EXPECT_EQ(element_node->leafAt(1)->type(), ::avro::AVRO_STRING); } TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToUnion) { @@ -1303,7 +1391,7 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToUnion) { auto name_mapping = CreateUnionNameMapping(); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsOk()); const auto& new_node = *result; @@ -1344,7 +1432,7 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) { })"; auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); ASSERT_THAT(result, HasErrorMessage("Field ID is missing for field 'name' in nested mapping")); @@ -1366,7 +1454,7 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldError) { })"; auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); ASSERT_THAT(result, HasErrorMessage("Field 'unknown_field' not found in nested mapping")); @@ -1392,7 +1480,7 @@ TEST_F(NameMappingAvroSchemaTest, MissingArrayElementError) { })"; auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); ASSERT_THAT(result, HasErrorMessage("Field 'items' not found in nested mapping")); } @@ -1417,7 +1505,7 @@ TEST_F(NameMappingAvroSchemaTest, MissingMapKeyValueError) { })"; auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - auto result = CreateAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); + auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); ASSERT_THAT(result, HasErrorMessage("Field 'properties' not found in nested mapping")); } From c9ff700ba274f48aedbc9066970a093ead30f63f Mon Sep 17 00:00:00 2001 From: liuxiaoyu Date: Tue, 5 Aug 2025 22:36:32 +0800 Subject: [PATCH 15/15] update code --- src/iceberg/avro/avro_schema_util.cc | 151 +++++++++++---------------- test/avro_schema_test.cc | 84 ++------------- 2 files changed, 66 insertions(+), 169 deletions(-) diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index e30bdbf3..ad59e348 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -803,40 +803,37 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig } } - if (nested_field) { - // Check if field_id is present - if (!nested_field->field_id.has_value()) { - return InvalidSchema("Field ID is missing for field '{}' in nested mapping", - field_name); - } + if (!nested_field) { + return InvalidSchema("Field '{}' not found in nested mapping", field_name); + } - // Preserve existing custom attributes for this field - ::avro::CustomAttributes attributes; - if (i < original_node->customAttributes()) { - // Copy all existing attributes from the original node - const auto& original_attrs = original_node->customAttributesAt(i); - const auto& existing_attrs = original_attrs.attributes(); - for (const auto& attr_pair : existing_attrs) { - // Copy each existing attribute to preserve original metadata - attributes.addAttribute(attr_pair.first, attr_pair.second, false); - } + if (!nested_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for field '{}' in nested mapping", + field_name); + } + + // Preserve existing custom attributes for this field + ::avro::CustomAttributes attributes; + if (i < original_node->customAttributes()) { + // Copy all existing attributes from the original node + for (const auto& attr_pair : original_node->customAttributesAt(i).attributes()) { + // Copy each existing attribute to preserve original metadata + attributes.addAttribute(attr_pair.first, attr_pair.second, /*addQuote=*/false); } + } - // Add field ID attribute to the new node (preserving existing attributes) - attributes.addAttribute(std::string(kFieldIdProp), - std::to_string(nested_field->field_id.value()), false); + // Add field ID attribute to the new node (preserving existing attributes) + attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(nested_field->field_id.value()), + /*addQuote=*/false); - new_record_node->addCustomAttributesForField(attributes); + new_record_node->addCustomAttributesForField(attributes); - // Recursively apply field IDs to nested fields - ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node, - MakeAvroNodeWithFieldIds(field_node, *nested_field)); - new_record_node->addName(field_name); - new_record_node->addLeaf(new_nested_node); - } else { - // If no nested field found, this is an error - return InvalidSchema("Field '{}' not found in nested mapping", field_name); - } + // Recursively apply field IDs to nested fields + ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node, + MakeAvroNodeWithFieldIds(field_node, *nested_field)); + new_record_node->addName(field_name); + new_record_node->addLeaf(new_nested_node); } return new_record_node; @@ -858,39 +855,29 @@ Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& origi return new_array_node; } - // For regular arrays, try to find element field ID from nested mapping - const MappedField* element_field = nullptr; - if (field.nested_mapping) { - auto fields_span = field.nested_mapping->fields(); - for (const auto& f : fields_span) { - if (f.names.find(std::string(kElement)) != f.names.end()) { - element_field = &f; - break; - } - } + // For regular arrays, use the first field from nested mapping as element field + if (!field.nested_mapping || field.nested_mapping->fields().empty()) { + return InvalidSchema("Array type requires nested mapping with element field"); } - if (element_field) { - // Check if field_id is present - if (!element_field->field_id.has_value()) { - return InvalidSchema("Field ID is missing for element field in array"); - } + const auto& element_field = field.nested_mapping->fields()[0]; - ICEBERG_ASSIGN_OR_RAISE( - auto new_element_node, - MakeAvroNodeWithFieldIds(original_node->leafAt(0), *element_field)); - new_array_node->addLeaf(new_element_node); - - // Add element field ID as custom attribute - ::avro::CustomAttributes element_attributes; - element_attributes.addAttribute(std::string(kFieldIdProp), - std::to_string(*element_field->field_id), false); - new_array_node->addCustomAttributesForField(element_attributes); - } else { - // If no element field found, this is an error - return InvalidSchema("Element field not found in nested mapping for array"); + if (!element_field.field_id.has_value()) { + return InvalidSchema("Field ID is missing for element field in array"); } + ICEBERG_ASSIGN_OR_RAISE( + auto new_element_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), element_field)); + new_array_node->addLeaf(new_element_node); + + // Add element field ID as custom attribute + ::avro::CustomAttributes element_attributes; + element_attributes.addAttribute(std::string(kElementIdProp), + std::to_string(*element_field.field_id), + /*addQuote=*/false); + new_array_node->addCustomAttributesForField(element_attributes); + return new_array_node; } @@ -908,45 +895,25 @@ Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& origina return InvalidSchema("Map type requires nested mapping for key and value fields"); } - // Find key and value field mappings by name - std::optional key_id = field.nested_mapping->Id("key"); - std::optional value_id = field.nested_mapping->Id("value"); - - if (!key_id || !value_id) { - return InvalidSchema("Map type requires both 'key' and 'value' field mappings"); - } - - std::optional key_field_ref = field.nested_mapping->Field(*key_id); - std::optional value_field_ref = - field.nested_mapping->Field(*value_id); - - if (!key_field_ref || !value_field_ref) { - return InvalidSchema("Map type requires both key and value field mappings"); + // For map types, use the first two fields from nested mapping as key and value + if (!field.nested_mapping || field.nested_mapping->fields().size() < 2) { + return InvalidSchema("Map type requires nested mapping with key and value fields"); } - const auto& key_mapped_field = key_field_ref->get(); - const auto& value_mapped_field = value_field_ref->get(); + const auto& key_mapped_field = field.nested_mapping->fields()[0]; + const auto& value_mapped_field = field.nested_mapping->fields()[1]; if (!key_mapped_field.field_id || !value_mapped_field.field_id) { return InvalidSchema("Map key and value fields must have field IDs"); } - // Create key field with mapped field ID - MappedField key_field; - key_field.field_id = *key_mapped_field.field_id; - key_field.nested_mapping = key_mapped_field.nested_mapping; - - // Create value field with mapped field ID - MappedField value_field; - value_field.field_id = *value_mapped_field.field_id; - value_field.nested_mapping = value_mapped_field.nested_mapping; - // Add key and value nodes - ICEBERG_ASSIGN_OR_RAISE(auto new_key_node, - MakeAvroNodeWithFieldIds(original_node->leafAt(0), key_field)); + ICEBERG_ASSIGN_OR_RAISE( + auto new_key_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), key_mapped_field)); ICEBERG_ASSIGN_OR_RAISE( auto new_value_node, - MakeAvroNodeWithFieldIds(original_node->leafAt(1), value_field)); + MakeAvroNodeWithFieldIds(original_node->leafAt(1), value_mapped_field)); new_map_node->addLeaf(new_key_node); new_map_node->addLeaf(new_value_node); @@ -958,19 +925,21 @@ Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& origina for (const auto& attr_pair : existing_attrs) { // Copy each existing attribute to preserve original metadata ::avro::CustomAttributes attributes; - attributes.addAttribute(attr_pair.first, attr_pair.second, false); + attributes.addAttribute(attr_pair.first, attr_pair.second, /*addQuote=*/false); new_map_node->addCustomAttributesForField(attributes); } } ::avro::CustomAttributes key_attributes; - key_attributes.addAttribute(std::string(kFieldIdProp), - std::to_string(*key_mapped_field.field_id), false); + key_attributes.addAttribute(std::string(kKeyIdProp), + std::to_string(*key_mapped_field.field_id), + /*addQuote=*/false); new_map_node->addCustomAttributesForField(key_attributes); ::avro::CustomAttributes value_attributes; - value_attributes.addAttribute(std::string(kFieldIdProp), - std::to_string(*value_mapped_field.field_id), false); + value_attributes.addAttribute(std::string(kValueIdProp), + std::to_string(*value_mapped_field.field_id), + /*addQuote=*/false); new_map_node->addCustomAttributesForField(value_attributes); return new_map_node; diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc index 1e971ec0..6fbbb851 100644 --- a/test/avro_schema_test.cc +++ b/test/avro_schema_test.cc @@ -25,7 +25,6 @@ #include #include "iceberg/avro/avro_schema_util_internal.h" -#include "iceberg/json_internal.h" #include "iceberg/metadata_columns.h" #include "iceberg/name_mapping.h" #include "iceberg/schema.h" @@ -1274,7 +1273,8 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToArray) { // Check that array element has field ID applied const auto& element_node = array_node->leafAt(0); EXPECT_EQ(element_node->type(), ::avro::AVRO_STRING); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(array_node, /*index=*/0, /*field_id=*/20)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(array_node, /*index=*/0, /*field_id=*/20, + /*key=*/"element-id")); } TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { @@ -1314,8 +1314,10 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToMap) { EXPECT_EQ(map_node->leafAt(0)->type(), ::avro::AVRO_STRING); EXPECT_EQ(map_node->leafAt(1)->type(), ::avro::AVRO_STRING); ASSERT_EQ(map_node->customAttributes(), 2); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(map_node, /*index=*/0, /*field_id=*/30)); - ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(map_node, /*index=*/1, /*field_id=*/31)); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(map_node, /*index=*/0, /*field_id=*/30, + /*key=*/"key-id")); + ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(map_node, /*index=*/1, /*field_id=*/31, + /*key=*/"value-id")); } TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToComplexMap) { @@ -1409,7 +1411,6 @@ TEST_F(NameMappingAvroSchemaTest, ApplyNameMappingToUnion) { EXPECT_EQ(union_node->type(), ::avro::AVRO_UNION); EXPECT_EQ(union_node->leaves(), 2); - // Check that the non-null branch has field ID applied const auto& non_null_branch = union_node->leafAt(1); EXPECT_EQ(non_null_branch->type(), ::avro::AVRO_STRING); } @@ -1434,79 +1435,6 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) { auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); - ASSERT_THAT(result, - HasErrorMessage("Field ID is missing for field 'name' in nested mapping")); -} - -TEST_F(NameMappingAvroSchemaTest, MissingFieldError) { - // Create a name mapping - auto name_mapping = CreateSimpleNameMapping(); - - // Create an Avro record schema with a field not in the mapping - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "name", "type": "string"}, - {"name": "unknown_field", "type": "int"} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); - ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); - ASSERT_THAT(result, - HasErrorMessage("Field 'unknown_field' not found in nested mapping")); -} - -TEST_F(NameMappingAvroSchemaTest, MissingArrayElementError) { - // Create a name mapping without array element mapping - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - auto name_mapping = NameMapping::Make(std::move(fields)); - - // Create an Avro array schema - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "items", "type": { - "type": "array", - "items": "string" - }} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); - ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); - ASSERT_THAT(result, HasErrorMessage("Field 'items' not found in nested mapping")); } -TEST_F(NameMappingAvroSchemaTest, MissingMapKeyValueError) { - // Create a name mapping without map key/value mapping - std::vector fields; - fields.emplace_back(MappedField{.names = {"id"}, .field_id = 1}); - auto name_mapping = NameMapping::Make(std::move(fields)); - - // Create an Avro map schema - std::string avro_schema_json = R"({ - "type": "record", - "name": "test_record", - "fields": [ - {"name": "id", "type": "int"}, - {"name": "properties", "type": { - "type": "map", - "values": "string" - }} - ] - })"; - auto avro_schema = ::avro::compileJsonSchemaFromString(avro_schema_json); - - auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping); - ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema)); - ASSERT_THAT(result, HasErrorMessage("Field 'properties' not found in nested mapping")); -} } // namespace iceberg::avro