From bc75e89151268d4c4636665b4587b9642e592a6e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 7 Oct 2025 13:32:48 -0300 Subject: [PATCH 01/17] tmp --- src/Access/Common/AccessType.h | 1 + src/Interpreters/InterpreterAlterQuery.cpp | 6 + src/Parsers/ASTAlterQuery.cpp | 11 + src/Parsers/ASTAlterQuery.h | 1 + src/Parsers/CommonParsers.h | 1 + src/Parsers/ParserAlterQuery.cpp | 17 + src/Storages/IStorage.h | 9 + src/Storages/MergeTree/MergeTreeData.cpp | 36 +- src/Storages/MergeTree/MergeTreeData.h | 10 + .../MergeTree/MergeTreeExportManifest.h | 13 +- .../ReplicatedMergeTreeRestartingThread.cpp | 1 + .../ObjectStorageFilePathGenerator.h | 2 +- .../ObjectStorage/StorageObjectStorage.cpp | 20 + .../ObjectStorage/StorageObjectStorage.h | 6 + .../StorageObjectStorageCluster.cpp | 10 + .../StorageObjectStorageCluster.h | 6 + src/Storages/PartitionCommands.cpp | 11 + src/Storages/PartitionCommands.h | 1 + src/Storages/StorageReplicatedMergeTree.cpp | 350 ++++++++++++++++++ src/Storages/StorageReplicatedMergeTree.h | 7 +- 20 files changed, 510 insertions(+), 9 deletions(-) diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index de3f2e18136b..b948aa80afcb 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -75,6 +75,7 @@ enum class AccessType : uint8_t M(ALTER_SETTINGS, "ALTER SETTING, ALTER MODIFY SETTING, MODIFY SETTING, RESET SETTING", TABLE, ALTER_TABLE) /* allows to execute ALTER MODIFY SETTING */\ M(ALTER_MOVE_PARTITION, "ALTER MOVE PART, MOVE PARTITION, MOVE PART", TABLE, ALTER_TABLE) \ M(ALTER_EXPORT_PART, "ALTER EXPORT PART, EXPORT PART", TABLE, ALTER_TABLE) \ + M(ALTER_EXPORT_PARTITION, "ALTER EXPORT PARTITION, EXPORT PARTITION", TABLE, ALTER_TABLE) \ M(ALTER_FETCH_PARTITION, "ALTER FETCH PART, FETCH PARTITION", TABLE, ALTER_TABLE) \ M(ALTER_FREEZE_PARTITION, "FREEZE PARTITION, UNFREEZE", TABLE, ALTER_TABLE) \ M(ALTER_UNLOCK_SNAPSHOT, "UNLOCK SNAPSHOT", TABLE, ALTER_TABLE) \ diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 83a8691cf4ee..d5ac84c9bd6d 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -508,6 +508,12 @@ AccessRightsElements InterpreterAlterQuery::getRequiredAccessForCommand(const AS required_access.emplace_back(AccessType::INSERT, command.to_database, command.to_table); break; } + case ASTAlterCommand::EXPORT_PARTITION: + { + required_access.emplace_back(AccessType::ALTER_EXPORT_PARTITION, command.to_database, command.to_table); + required_access.emplace_back(AccessType::INSERT, command.to_database, command.to_table); + break; + } case ASTAlterCommand::FETCH_PARTITION: { required_access.emplace_back(AccessType::ALTER_FETCH_PARTITION, database, table); diff --git a/src/Parsers/ASTAlterQuery.cpp b/src/Parsers/ASTAlterQuery.cpp index 78ba7f14d645..dbcd17dc79e0 100644 --- a/src/Parsers/ASTAlterQuery.cpp +++ b/src/Parsers/ASTAlterQuery.cpp @@ -378,6 +378,17 @@ void ASTAlterCommand::formatImpl(WriteBuffer & ostr, const FormatSettings & sett } } + else if (type == ASTAlterCommand::EXPORT_PARTITION) + { + ostr << (settings.hilite ? hilite_keyword : "") << "EXPORT PARTITION " << (settings.hilite ? hilite_none : ""); + partition->format(ostr, settings, state, frame); + ostr << " TO TABLE "; + if (!to_database.empty()) + { + ostr << (settings.hilite ? hilite_identifier : "") << backQuoteIfNeed(to_database) << (settings.hilite ? hilite_none : "") << "."; + } + ostr << (settings.hilite ? 
hilite_identifier : "") << backQuoteIfNeed(to_table) << (settings.hilite ? hilite_none : ""); + } else if (type == ASTAlterCommand::REPLACE_PARTITION) { ostr << (settings.hilite ? hilite_keyword : "") << (replace ? "REPLACE" : "ATTACH") << " PARTITION " diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index d8d502cb87c6..7683b2e11c3d 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -72,6 +72,7 @@ class ASTAlterCommand : public IAST UNFREEZE_PARTITION, UNFREEZE_ALL, EXPORT_PART, + EXPORT_PARTITION, DELETE, UPDATE, diff --git a/src/Parsers/CommonParsers.h b/src/Parsers/CommonParsers.h index f0ba30ff1c6b..a0a59288b4a3 100644 --- a/src/Parsers/CommonParsers.h +++ b/src/Parsers/CommonParsers.h @@ -332,6 +332,7 @@ namespace DB MR_MACROS(MOVE_PART, "MOVE PART") \ MR_MACROS(MOVE_PARTITION, "MOVE PARTITION") \ MR_MACROS(EXPORT_PART, "EXPORT PART") \ + MR_MACROS(EXPORT_PARTITION, "EXPORT PARTITION") \ MR_MACROS(MOVE, "MOVE") \ MR_MACROS(MS, "MS") \ MR_MACROS(MUTATION, "MUTATION") \ diff --git a/src/Parsers/ParserAlterQuery.cpp b/src/Parsers/ParserAlterQuery.cpp index 289385a20eb2..cb56cb87baf9 100644 --- a/src/Parsers/ParserAlterQuery.cpp +++ b/src/Parsers/ParserAlterQuery.cpp @@ -83,6 +83,7 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserKeyword s_move_partition(Keyword::MOVE_PARTITION); ParserKeyword s_move_part(Keyword::MOVE_PART); ParserKeyword s_export_part(Keyword::EXPORT_PART); + ParserKeyword s_export_partition(Keyword::EXPORT_PARTITION); ParserKeyword s_drop_detached_partition(Keyword::DROP_DETACHED_PARTITION); ParserKeyword s_drop_detached_part(Keyword::DROP_DETACHED_PART); ParserKeyword s_fetch_partition(Keyword::FETCH_PARTITION); @@ -553,6 +554,22 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected return false; command->move_destination_type = DataDestinationType::TABLE; } + else if (s_export_partition.ignore(pos, expected)) + { + if (!parser_partition.parse(pos, command_partition, expected)) + return false; + + command->type = ASTAlterCommand::EXPORT_PARTITION; + + if (!s_to_table.ignore(pos, expected)) + { + return false; + } + + if (!parseDatabaseAndTableName(pos, expected, command->to_database, command->to_table)) + return false; + command->move_destination_type = DataDestinationType::TABLE; + } else if (s_move_partition.ignore(pos, expected)) { if (!parser_partition.parse(pos, command_partition, expected)) diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 19a30d952c8f..95d5a0c0e1a5 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -458,6 +458,15 @@ It is currently only implemented in StorageObjectStorage. { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Import is not implemented for storage {}", getName()); } + + virtual void commitExportPartitionTransaction( + const String & /* transaction_id */, + const String & /* partition_id */, + const Strings & /* exported_paths */, + ContextPtr /* local_context */) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "commitExportPartitionTransaction is not implemented for storage type {}", getName()); + } /** Writes the data to a table in distributed manner. 
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index bdceeca0abdc..8e4ecd57127c 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5906,10 +5906,20 @@ void MergeTreeData::exportPartToTable(const PartitionCommand & command, ContextP "Exporting merge tree part is experimental. Set `allow_experimental_export_merge_tree_part` to enable it"); } - String dest_database = query_context->resolveDatabase(command.to_database); - auto dest_storage = DatabaseCatalog::instance().getTable({dest_database, command.to_table}, query_context); + auto part_name = command.partition->as().value.safeGet(); - if (dest_storage->getStorageID() == this->getStorageID()) + exportPartToTable(part_name, StorageID{command.to_database, command.to_table}, query_context); +} + +void MergeTreeData::exportPartToTable( + const std::string & part_name, + const StorageID & destination_storage_id, + ContextPtr query_context, + std::function completion_callback) +{ + auto dest_storage = DatabaseCatalog::instance().getTable(destination_storage_id, query_context); + + if (destination_storage_id == this->getStorageID()) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Exporting to the same table is not allowed"); } @@ -5931,8 +5941,6 @@ void MergeTreeData::exportPartToTable(const PartitionCommand & command, ContextP if (query_to_string(src_snapshot->getPartitionKeyAST()) != query_to_string(destination_snapshot->getPartitionKeyAST())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key"); - auto part_name = command.partition->as().value.safeGet(); - auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); if (!part) @@ -5944,7 +5952,8 @@ void MergeTreeData::exportPartToTable(const PartitionCommand & command, ContextP dest_storage->getStorageID(), part, query_context->getSettingsRef()[Setting::export_merge_tree_part_overwrite_file_if_exists], - query_context->getSettingsRef()[Setting::output_format_parallel_formatting]); + query_context->getSettingsRef()[Setting::output_format_parallel_formatting], + completion_callback); std::lock_guard lock(export_manifests_mutex); @@ -6014,6 +6023,9 @@ void MergeTreeData::exportPartToTableImpl( std::lock_guard inner_lock(export_manifests_mutex); export_manifests.erase(manifest); + + if (manifest.completion_callback) + manifest.completion_callback({}); return; } @@ -6103,6 +6115,9 @@ void MergeTreeData::exportPartToTableImpl( ProfileEvents::increment(ProfileEvents::PartsExports); ProfileEvents::increment(ProfileEvents::PartsExportTotalMilliseconds, static_cast((*exports_list_entry)->elapsed * 1000)); + + if (manifest.completion_callback) + manifest.completion_callback({true, destination_file_path}); } catch (...) 
{ @@ -6124,6 +6139,9 @@ void MergeTreeData::exportPartToTableImpl( export_manifests.erase(manifest); + if (manifest.completion_callback) + manifest.completion_callback({}); + throw; } } @@ -6185,6 +6203,12 @@ Pipe MergeTreeData::alterPartition( break; } + case PartitionCommand::EXPORT_PARTITION: + { + exportPartitionToTable(command, query_context); + break; + } + case PartitionCommand::DROP_DETACHED_PARTITION: dropDetached(command.partition, command.part, query_context); break; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index b9007af0e6f0..9ee5b4f18506 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -906,6 +906,16 @@ class MergeTreeData : public IStorage, public WithMutableContext void exportPartToTable(const PartitionCommand & command, ContextPtr query_context); + void exportPartToTable( + const std::string & part_name, + const StorageID & destination_storage_id, ContextPtr query_context, + std::function completion_callback = {}); + + virtual void exportPartitionToTable(const PartitionCommand &, ContextPtr) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "EXPORT PARTITION is not implemented"); + } + void exportPartToTableImpl( const MergeTreeExportManifest & manifest, ContextPtr local_context); diff --git a/src/Storages/MergeTree/MergeTreeExportManifest.h b/src/Storages/MergeTree/MergeTreeExportManifest.h index 36831fd132ba..0ad93b41798b 100644 --- a/src/Storages/MergeTree/MergeTreeExportManifest.h +++ b/src/Storages/MergeTree/MergeTreeExportManifest.h @@ -8,16 +8,24 @@ struct MergeTreeExportManifest { using DataPartPtr = std::shared_ptr; + struct CompletionCallbackResult + { + bool success = false; + String relative_path_in_destination_storage; + }; + MergeTreeExportManifest( const StorageID & destination_storage_id_, const DataPartPtr & data_part_, bool overwrite_file_if_exists_, - bool parallel_formatting_) + bool parallel_formatting_, + std::function completion_callback_ = {}) : destination_storage_id(destination_storage_id_), data_part(data_part_), overwrite_file_if_exists(overwrite_file_if_exists_), parallel_formatting(parallel_formatting_), + completion_callback(completion_callback_), create_time(time(nullptr)) {} StorageID destination_storage_id; @@ -25,6 +33,9 @@ struct MergeTreeExportManifest bool overwrite_file_if_exists; bool parallel_formatting; + + std::function completion_callback; + time_t create_time; mutable bool in_progress = false; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp index 8420cd5738c2..0009a6a7199e 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp @@ -174,6 +174,7 @@ bool ReplicatedMergeTreeRestartingThread::runImpl() storage.cleanup_thread.start(); storage.async_block_ids_cache.start(); storage.part_check_thread.start(); + storage.export_merge_tree_partition_select_task->activateAndSchedule(); LOG_DEBUG(log, "Table started successfully"); return true; diff --git a/src/Storages/ObjectStorage/ObjectStorageFilePathGenerator.h b/src/Storages/ObjectStorage/ObjectStorageFilePathGenerator.h index 1ed503cbf3c6..7eb22a9960b8 100644 --- a/src/Storages/ObjectStorage/ObjectStorageFilePathGenerator.h +++ b/src/Storages/ObjectStorage/ObjectStorageFilePathGenerator.h @@ -51,7 +51,7 @@ namespace DB result += raw_path; - if (raw_path.back() != '/') + if (!result.empty() && 
result.back() != '/') { result += "/"; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index d256fa36f00e..99330e9720e2 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -486,6 +486,26 @@ SinkToStoragePtr StorageObjectStorage::import( local_context); } +void StorageObjectStorage::commitExportPartitionTransaction(const String & transaction_id, const String & partition_id, const Strings & exported_paths, ContextPtr local_context) +{ + const String commit_object = configuration->getRawPath().path + "/commit_" + partition_id + "_" + transaction_id; + + /// if file already exists, nothing to be done + if (object_storage->exists(StoredObject(commit_object))) + { + LOG_DEBUG(getLogger("StorageObjectStorage"), "Commit file already exists, nothing to be done: {}", commit_object); + return; + } + + auto out = object_storage->writeObject(StoredObject(commit_object), WriteMode::Rewrite, /* attributes= */ {}, DBMS_DEFAULT_BUFFER_SIZE, local_context->getWriteSettings()); + for (const auto & p : exported_paths) + { + out->write(p.data(), p.size()); + out->write("\n", 1); + } + out->finalize(); +} + void StorageObjectStorage::truncate( const ASTPtr & /* query */, const StorageMetadataPtr & /* metadata_snapshot */, diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index f12c6bd6077f..c9999170a343 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -110,6 +110,12 @@ class StorageObjectStorage : public IStorage bool /* overwrite_if_exists */, ContextPtr /* context */) override; + void commitExportPartitionTransaction( + const String & transaction_id, + const String & partition_id, + const Strings & exported_paths, + ContextPtr local_context) override; + void truncate( const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 03fcd9962c4a..8b48fcaf64b8 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -622,5 +622,15 @@ SinkToStoragePtr StorageObjectStorageCluster::import( return IStorageCluster::import(file_name, block_with_partition_values, destination_file_path, overwrite_if_exists, context); } +void StorageObjectStorageCluster::commitExportPartitionTransaction( + const String & transaction_id, + const String & partition_id, + const Strings & exported_paths, + ContextPtr local_context) +{ + if (pure_storage) + return pure_storage->commitExportPartitionTransaction(transaction_id, partition_id, exported_paths, local_context); + return IStorageCluster::commitExportPartitionTransaction(transaction_id, partition_id, exported_paths, local_context); +} } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index 374d35877048..5b2cc5e92ca2 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -67,6 +67,12 @@ class StorageObjectStorageCluster : public IStorageCluster bool /* overwrite_if_exists */, ContextPtr /* context */) override; + void commitExportPartitionTransaction( + const String & transaction_id, + const String & partition_id, + const 
Strings & exported_paths, + ContextPtr local_context) override; + private: void updateQueryToSendIfNeeded( ASTPtr & query, diff --git a/src/Storages/PartitionCommands.cpp b/src/Storages/PartitionCommands.cpp index 05fb3a639bb8..1cd75827682e 100644 --- a/src/Storages/PartitionCommands.cpp +++ b/src/Storages/PartitionCommands.cpp @@ -140,6 +140,15 @@ std::optional PartitionCommand::parse(const ASTAlterCommand * res.to_table = command_ast->to_table; return res; } + if (command_ast->type == ASTAlterCommand::EXPORT_PARTITION) + { + PartitionCommand res; + res.type = EXPORT_PARTITION; + res.partition = command_ast->partition->clone(); + res.to_database = command_ast->to_database; + res.to_table = command_ast->to_table; + return res; + } return {}; } @@ -183,6 +192,8 @@ std::string PartitionCommand::typeToString() const return "REPLACE PARTITION"; case PartitionCommand::Type::EXPORT_PART: return "EXPORT PART"; + case PartitionCommand::Type::EXPORT_PARTITION: + return "EXPORT PARTITION"; default: throw Exception(ErrorCodes::LOGICAL_ERROR, "Uninitialized partition command"); } diff --git a/src/Storages/PartitionCommands.h b/src/Storages/PartitionCommands.h index 15d2a7fb869f..e3f36d0e7c1f 100644 --- a/src/Storages/PartitionCommands.h +++ b/src/Storages/PartitionCommands.h @@ -34,6 +34,7 @@ struct PartitionCommand UNFREEZE_PARTITION, REPLACE_PARTITION, EXPORT_PART, + EXPORT_PARTITION, }; Type type = UNKNOWN; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 956e8e8913b7..5bf993b11a6a 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -113,6 +113,9 @@ #include #include +#include +#include +#include #include #include @@ -178,6 +181,7 @@ namespace Setting extern const SettingsSeconds receive_timeout; extern const SettingsInt64 replication_wait_for_inactive_replica_timeout; extern const SettingsUInt64 select_sequential_consistency; + extern const SettingsBool allow_experimental_export_merge_tree_part; } namespace MergeTreeSetting @@ -441,6 +445,11 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( mutations_finalizing_task = getContext()->getSchedulePool().createTask( getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mutationsFinalizingTask)", [this] { mutationsFinalizingTask(); }); + export_merge_tree_partition_select_task = getContext()->getSchedulePool().createTask( + getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::export_merge_tree_partition_select_task)", [this] { selectPartsToExport(); }); + + export_merge_tree_partition_select_task->deactivate(); + /// This task can be scheduled by different parts of code even when storage is readonly. /// This can lead to redundant exceptions during startup. /// Will be activated by restarting thread. 
@@ -4322,6 +4331,197 @@ void StorageReplicatedMergeTree::mutationsFinalizingTask() } } +// static void exportPartitionCallback( +// MergeTreeExportManifest::CompletionCallbackResult result, +// zkutil::ZooKeeperPtr & zookeeper, +// const String & partition_export, +// const String & part_name) +// { +// if (!result.success) +// { +// LOG_INFO(log, "Failed to export partition {} part {}, releasing lock", partition_export, part_name); +// zookeeper->remove(owner_path); +// return; +// } + +// zookeeper->create(part_path / "path", result.relative_path_in_destination_storage, zkutil::CreateMode::Persistent); +// LOG_INFO(log, "Exported partition {} part {} to {}", partition_export, part_name, result.relative_path_in_destination_storage); + +// /// loop over all parts again and check if they all have the path set, if so, mark the export as done +// bool completed = true; +// Strings exported_paths; +// for (const auto & possibly_commited_part : part_names) +// { +// /// todo arthur is this a case of "try to write as opposed to try to read because it might be stale?" +// const auto possibly_commited_part_path = parts_path / possibly_commited_part; +// if (!zookeeper->exists(possibly_commited_part_path / "path")) +// { +// completed = false; +// break; +// } +// exported_paths.push_back(zookeeper->get(possibly_commited_part_path / "path")); +// } + +// if (completed) +// { + +// const auto destination_storage = DatabaseCatalog::instance().tryGetTable( +// StorageID(QualifiedTableName::parseFromString(destination_storage_id)), +// getContext()); +// const auto transaction_id = zookeeper->get(partition_export_path / "transaction_id"); +// /// possibly need to add a check here to see if the table still exists +// destination_storage->commitExportPartitionTransaction(transaction_id, partition_export, exported_paths, getContext()); +// zookeeper->set(partition_export_path / "status", "COMPLETED"); +// } +// } + +void StorageReplicatedMergeTree::selectPartsToExport() +{ + auto zookeeper = getZooKeeper(); + auto exports_path = fs::path(zookeeper_path) / "exports"; + + /// grab all exports/ + Strings partition_exports; + zookeeper->tryGetChildren(exports_path, partition_exports); + + for (const auto & partition_export : partition_exports) + { + const auto partition_export_path = fs::path(exports_path) / partition_export; + const auto destination_storage_id = zookeeper->get(partition_export_path / "destination_storage_id"); + + if (destination_storage_id.empty()) + { + LOG_WARNING(log, "Failed to grab destination storage id for partition {} export", partition_export); + continue; + } + + const auto time_to_live_seconds = std::stoi(zookeeper->get(partition_export_path / "time_to_live_seconds")); + const auto create_time = std::stoi(zookeeper->get(partition_export_path / "create_time")); + + if (time(nullptr) - create_time > time_to_live_seconds) + { + LOG_INFO(log, "Partition {} export has expired, removing zk entry", partition_export); + zookeeper->tryRemoveRecursive(partition_export_path); + continue; + } + + if (zookeeper->get(partition_export_path / "status") != "IN_PROGRESS") + { + LOG_INFO(log, "Partition {} export is not in progress, skipping", partition_export); + continue; + } + + /// grab all part names, if it already has an owner, skip + /// otherwise, try to atomically claim it + const auto parts_path = partition_export_path / "parts"; + auto part_names = zookeeper->getChildren(parts_path); + + for (const auto & part_name : part_names) + { + /// The check if this replica contains the part is 
also performed in `exportPartToTable`. + /// Doing it here so that we don't even bother claiming the part export if we don't have it. + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); + if (!part) + { + LOG_INFO(log, "Part {} not found in the replica, skipping export", part_name); + continue; + } + + auto part_path = parts_path / part_name; + auto owner_path = part_path / "owner"; + + /// Ephemeral because it acts like a lock. AFAIK, if this node crashes, the lock will be released and another replica can claim it. + if (Coordination::Error::ZOK != zookeeper->tryCreate(owner_path, replica_name, zkutil::CreateMode::Ephemeral)) + { + LOG_INFO(log, "Export partition {} part {} already has an owner or failed to claim it, skipping", partition_export, part_name); + continue; + } + + LOG_INFO(log, "Claimed export partition {} part {}", partition_export, part_name); + + /// Two failure paths: catch block and the lambda function. The try-catch is for the case where it fails to schedule the export. + /// The lambda is for the case the export itself fails. + try + { + std::function callback = [this, partition_export, part_name, owner_path, part_path, parts_path, part_names, destination_storage_id, partition_export_path] + (MergeTreeExportManifest::CompletionCallbackResult result) + { + auto zk = getZooKeeper(); + if (!result.success) + { + LOG_INFO(log, "Failed to export part {}, releasing lock", part_name); + + const auto retry_count = std::stoi(zk->get(part_path / "retry_count")) + 1; + const auto max_retry_count = std::stoi(zk->get(part_path / "max_retry_count")); + + if (retry_count >= max_retry_count) + { + LOG_INFO(log, "Failed to export part {} after {} retries, setting status to FAILED", part_name, retry_count); + zk->set(partition_export_path / "status", "FAILED"); + } + else + { + zk->set(part_path / "retry_count", std::to_string(retry_count)); + } + + zk->remove(owner_path); + return; + } + + zk->create(part_path / "path", result.relative_path_in_destination_storage, zkutil::CreateMode::Persistent); + LOG_INFO(log, "Exported partition {} part {} to {}", partition_export, part_name, result.relative_path_in_destination_storage); + + /// loop over all parts again and check if they all have the path set, if so, mark the export as done + Strings exported_paths; + for (const auto & possibly_commited_part : part_names) + { + /// todo arthur is this a case of "try to write as opposed to try to read because it might be stale?" 
+ const auto possibly_commited_part_path = parts_path / possibly_commited_part; + if (!zk->exists(possibly_commited_part_path / "path")) + { + break; + } + exported_paths.push_back(zk->get(possibly_commited_part_path / "path")); + } + + bool exported_all_parts = exported_paths.size() == part_names.size(); + if (!exported_all_parts) + { + return; + } + + const auto destination_storage = DatabaseCatalog::instance().tryGetTable( + StorageID(QualifiedTableName::parseFromString(destination_storage_id)), + getContext()); + const auto transaction_id = zk->get(partition_export_path / "transaction_id"); + /// possibly need to add a check here to see if the table still exists + destination_storage->commitExportPartitionTransaction(transaction_id, partition_export, exported_paths, getContext()); + zk->tryCreate(partition_export_path / "status", "COMPLETED", zkutil::CreateMode::Persistent); + }; + + exportPartToTable( + part_name, + StorageID(QualifiedTableName::parseFromString(destination_storage_id)), + getContext(), + callback); + + /// for now, export only one part at a time + break; + } + catch (...) + { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + + zookeeper->remove(owner_path); + /// for now, export only one part at a time + break; + } + } + } + + export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); +} + StorageReplicatedMergeTree::CreateMergeEntryResult StorageReplicatedMergeTree::createLogEntryToMergeParts( zkutil::ZooKeeperPtr & zookeeper, @@ -7921,6 +8121,156 @@ void StorageReplicatedMergeTree::fetchPartition( LOG_TRACE(log, "Fetch took {} sec. ({} tries)", watch.elapsedSeconds(), try_no); } +void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & command, ContextPtr query_context) +{ + if (!query_context->getSettingsRef()[Setting::allow_experimental_export_merge_tree_part]) + { + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Exporting merge tree part is experimental. Set `allow_experimental_export_merge_tree_part` to enable it"); + } + + String dest_database = query_context->resolveDatabase(command.to_database); + auto dest_storage = DatabaseCatalog::instance().getTable({dest_database, command.to_table}, query_context); + + if (dest_storage->getStorageID() == this->getStorageID()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Exporting to the same table is not allowed"); + } + + if (!dest_storage->supportsImport()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Destination storage {} does not support MergeTree parts or uses unsupported partitioning", dest_storage->getName()); + + auto query_to_string = [] (const ASTPtr & ast) + { + return ast ? 
ast->formatWithSecretsOneLine() : ""; + }; + + auto src_snapshot = getInMemoryMetadataPtr(); + auto destination_snapshot = dest_storage->getInMemoryMetadataPtr(); + + if (destination_snapshot->getColumns().getAllPhysical().sizeOfDifference(src_snapshot->getColumns().getAllPhysical())) + throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure"); + + if (query_to_string(src_snapshot->getPartitionKeyAST()) != query_to_string(destination_snapshot->getPartitionKeyAST())) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key"); + + zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly(); + + const String partition_id = getPartitionIDFromQuery(command.partition, query_context); + + const auto exports_path = fs::path(zookeeper_path) / "exports"; + Coordination::Requests ops; + + /// maybe this should go in initialization somewhere else + if (!zookeeper->exists(exports_path)) + { + ops.emplace_back(zkutil::makeCreateRequest(exports_path, "", zkutil::CreateMode::Persistent)); + } + + const auto partition_exports_path = exports_path / partition_id; + + /// check if entry already exists + if (zookeeper->exists(partition_exports_path)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition {} already exported or it is being exported", partition_id); + } + + ops.emplace_back(zkutil::makeCreateRequest(partition_exports_path, "", zkutil::CreateMode::Persistent)); + + const Strings parts = zookeeper->getChildren(fs::path(replica_path) / "parts"); + const ActiveDataPartSet active_parts_set(format_version, parts); + const auto part_infos = active_parts_set.getPartInfos(); + std::vector parts_to_export; + for (const auto & part_info : part_infos) + { + if (part_info.getPartitionId() == partition_id) + { + parts_to_export.push_back(part_info.getPartNameV1()); + } + } + + if (parts_to_export.empty()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition {} doesn't exist", partition_id); + } + + /// somehow check if the list of parts is updated "enough" + + /* + table_path/exports/partition_id + table_path/exports/partition_id/transaction_id + table_path/exports/partition_id/destination_storage_id + table_path/exports/partition_id/status + table_path/exports/partition_id/create_time + table_path/exports/partition_id/time_to_live_seconds + table_path/exports/partition_id/part_name_1 + table_path/exports/partition_id/part_name_1/status + table_path/exports/partition_id/part_name_1/owner + table_path/exports/partition_id/part_name_1/retry_count + table_path/exports/partition_id/part_name_1/max_retry_count + table_path/exports/partition_id/part_name_1/path + table_path/exports/partition_id/part_name_n + ... 
+ */ + + const auto transaction_id = std::to_string(generateSnowflakeID()); + + const auto parts_path = fs::path(partition_exports_path) / "parts"; + ops.emplace_back(zkutil::makeCreateRequest( + parts_path, "", + zkutil::CreateMode::Persistent)); + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "transaction_id", + transaction_id, + zkutil::CreateMode::Persistent)); + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "destination_storage_id", + dest_storage->getStorageID().getFullTableName(), + zkutil::CreateMode::Persistent)); + + /// status: IN_PROGRESS, COMPLETED, FAILED + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "status", + "IN_PROGRESS", + zkutil::CreateMode::Persistent)); + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "create_time", + std::to_string(time(nullptr)), + zkutil::CreateMode::Persistent)); + + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "time_to_live_seconds", + std::to_string(3600), + zkutil::CreateMode::Persistent)); + + for (const String & part_name : parts_to_export) + { + const String part_path = fs::path(parts_path) / part_name; + + ops.emplace_back(zkutil::makeCreateRequest(part_path, "", zkutil::CreateMode::Persistent)); + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(part_path) / "retry_count", + "0", + zkutil::CreateMode::Persistent)); + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(part_path) / "max_retry_count", + "3", + zkutil::CreateMode::Persistent)); + } + + Coordination::Responses responses; + Coordination::Error code = zookeeper->tryMulti(ops, responses); + + if (code != Coordination::Error::ZOK) + throw zkutil::KeeperException::fromPath(code, partition_exports_path); +} + void StorageReplicatedMergeTree::forgetPartition(const ASTPtr & partition, ContextPtr query_context) { diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index a63a7ef86d4d..f730ff29098a 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -506,6 +506,8 @@ class StorageReplicatedMergeTree final : public MergeTreeData /// A task that marks finished mutations as done. BackgroundSchedulePoolTaskHolder mutations_finalizing_task; + BackgroundSchedulePoolTaskHolder export_merge_tree_partition_select_task; + /// A thread that removes old parts, log entries, and blocks. ReplicatedMergeTreeCleanupThread cleanup_thread; @@ -735,6 +737,8 @@ class StorageReplicatedMergeTree final : public MergeTreeData /// Checks if some mutations are done and marks them as done. void mutationsFinalizingTask(); + void selectPartsToExport(); + /** Write the selected parts to merge into the log, * Call when merge_selecting_mutex is locked. * Returns false if any part is not in ZK. @@ -921,7 +925,8 @@ class StorageReplicatedMergeTree final : public MergeTreeData bool fetch_part, ContextPtr query_context) override; void forgetPartition(const ASTPtr & partition, ContextPtr query_context) override; - + + void exportPartitionToTable(const PartitionCommand &, ContextPtr) override; /// NOTE: there are no guarantees for concurrent merges. 
Dropping part can
    /// be concurrently merged into some covering part and dropPart will do

From 9fa5ab029b90a246cd28771b7e937070915bc547 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Thu, 9 Oct 2025 10:43:50 -0300
Subject: [PATCH 02/17] tmp2 - just in case of disaster recovery

---
 ...portReplicatedMergeTreePartitionManifest.h |  68 ++
 ...ortReplicatedMergeTreePartitionTaskEntry.h |  13 +
 .../ReplicatedMergeTreeRestartingThread.cpp   |   1 +
 src/Storages/StorageReplicatedMergeTree.cpp   | 601 +++++++++++++-----
 src/Storages/StorageReplicatedMergeTree.h     |  12 +
 5 files changed, 541 insertions(+), 154 deletions(-)
 create mode 100644 src/Storages/ExportReplicatedMergeTreePartitionManifest.h
 create mode 100644 src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h

diff --git a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
new file mode 100644
index 000000000000..11bdb45e5bc3
--- /dev/null
+++ b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <base/types.h>
+#include <Poco/JSON/Object.h>
+#include <Poco/JSON/Parser.h>
+#include <Poco/JSON/Stringifier.h>
+#include <vector>
+
+namespace DB
+{
+
+struct ExportReplicatedMergeTreePartitionManifest
+{
+    String transaction_id;
+    String partition_id;
+    String destination_database;
+    String destination_table;
+    String source_replica;
+    size_t number_of_parts;
+    std::vector<String> parts;
+    time_t create_time;
+
+    std::string toJsonString() const
+    {
+        Poco::JSON::Object json;
+        json.set("transaction_id", transaction_id);
+        json.set("partition_id", partition_id);
+        json.set("destination_database", destination_database);
+        json.set("destination_table", destination_table);
+        json.set("source_replica", source_replica);
+        json.set("number_of_parts", number_of_parts);
+
+        Poco::JSON::Array::Ptr parts_array = new Poco::JSON::Array();
+        for (const auto & part : parts)
+            parts_array->add(part);
+        json.set("parts", parts_array);
+
+        json.set("create_time", create_time);
+        std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
+        oss.exceptions(std::ios::failbit);
+        Poco::JSON::Stringifier::stringify(json, oss);
+        return oss.str();
+    }
+
+    static ExportReplicatedMergeTreePartitionManifest fromJsonString(const std::string & json_string)
+    {
+        Poco::JSON::Parser parser;
+        auto json = parser.parse(json_string).extract<Poco::JSON::Object::Ptr>();
+        chassert(json);
+
+        ExportReplicatedMergeTreePartitionManifest manifest;
+        manifest.transaction_id = json->getValue<String>("transaction_id");
+        manifest.partition_id = json->getValue<String>("partition_id");
+        manifest.destination_database = json->getValue<String>("destination_database");
+        manifest.destination_table = json->getValue<String>("destination_table");
+        manifest.source_replica = json->getValue<String>("source_replica");
+        manifest.number_of_parts = json->getValue<size_t>("number_of_parts");
+
+        auto parts_array = json->getArray("parts");
+        for (size_t i = 0; i < parts_array->size(); ++i)
+            manifest.parts.push_back(parts_array->getElement<String>(static_cast<unsigned int>(i)));
+
+        manifest.create_time = json->getValue<time_t>("create_time");
+        return manifest;
+    }
+};
+
+}
diff --git a/src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h b/src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h
new file mode 100644
index 000000000000..122c29b1b244
--- /dev/null
+++ b/src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <Storages/ExportReplicatedMergeTreePartitionManifest.h>
+
+namespace DB
+{
+struct ExportReplicatedMergeTreePartitionTaskEntry
+{
+    ExportReplicatedMergeTreePartitionManifest manifest;
+
+    std::size_t parts_to_do;
+};
+}
diff --git
a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp index 0009a6a7199e..c7b183ea0895 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp @@ -174,6 +174,7 @@ bool ReplicatedMergeTreeRestartingThread::runImpl() storage.cleanup_thread.start(); storage.async_block_ids_cache.start(); storage.part_check_thread.start(); + storage.export_merge_tree_partition_updating_task->activateAndSchedule(); storage.export_merge_tree_partition_select_task->activateAndSchedule(); LOG_DEBUG(log, "Table started successfully"); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 5bf993b11a6a..f8482ecfe4a0 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7,6 +7,7 @@ #include #include +#include "Common/ZooKeeper/IKeeper.h" #include #include #include @@ -113,6 +114,9 @@ #include #include +#include "Interpreters/StorageID.h" +#include "Storages/ExportReplicatedMergeTreePartitionManifest.h" +#include "Storages/ExportReplicatedMergeTreePartitionTaskEntry.h" #include #include #include @@ -445,16 +449,23 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( mutations_finalizing_task = getContext()->getSchedulePool().createTask( getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mutationsFinalizingTask)", [this] { mutationsFinalizingTask(); }); - export_merge_tree_partition_select_task = getContext()->getSchedulePool().createTask( - getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::export_merge_tree_partition_select_task)", [this] { selectPartsToExport(); }); - - export_merge_tree_partition_select_task->deactivate(); - /// This task can be scheduled by different parts of code even when storage is readonly. /// This can lead to redundant exceptions during startup. /// Will be activated by restarting thread. mutations_finalizing_task->deactivate(); + export_merge_tree_partition_updating_task = getContext()->getSchedulePool().createTask( + getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::export_merge_tree_partition_updating_task)", [this] { exportMergeTreePartitionUpdatingTask(); }); + + export_merge_tree_partition_updating_task->deactivate(); + + export_merge_tree_partition_watch_callback = std::make_shared(export_merge_tree_partition_updating_task->getWatchCallback()); + + export_merge_tree_partition_select_task = getContext()->getSchedulePool().createTask( + getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::export_merge_tree_partition_select_task)", [this] { selectPartsToExport(); }); + + export_merge_tree_partition_select_task->deactivate(); + bool has_zookeeper = getContext()->hasZooKeeper() || getContext()->hasAuxiliaryZooKeeper(zookeeper_info.zookeeper_name); if (has_zookeeper) { @@ -1037,6 +1048,8 @@ bool StorageReplicatedMergeTree::createTableIfNotExistsAttempt(const StorageMeta zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/mutations", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/exports", "", + zkutil::CreateMode::Persistent)); /// And create first replica atomically. See also "createReplica" method that is used to create not the first replicas. 
@@ -4375,147 +4388,415 @@ void StorageReplicatedMergeTree::mutationsFinalizingTask() // } // } -void StorageReplicatedMergeTree::selectPartsToExport() +void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() { - auto zookeeper = getZooKeeper(); - auto exports_path = fs::path(zookeeper_path) / "exports"; + std::lock_guard lock(export_merge_tree_partition_mutex); + + auto zk = getZooKeeper(); + + const auto exports_path = fs::path(zookeeper_path) / "exports"; + /// why do I need the stat? + Coordination::Stat stat; + const auto partition_exports_in_zk = zk->getChildrenWatch(exports_path, &stat, export_merge_tree_partition_watch_callback); + const auto partition_exports_in_zk_set = std::unordered_set(partition_exports_in_zk.begin(), partition_exports_in_zk.end()); - /// grab all exports/ - Strings partition_exports; - zookeeper->tryGetChildren(exports_path, partition_exports); + std::unordered_set local_export_partition_entries; + for (const auto & entry : export_merge_tree_partition_task_entries) + local_export_partition_entries.insert(entry.first); - for (const auto & partition_export : partition_exports) + std::unordered_set new_entries; + std::unordered_set removed_entries; + + for (const auto & partition_export_task_entry : local_export_partition_entries) + { + if (!partition_exports_in_zk_set.contains(partition_export_task_entry)) + removed_entries.insert(partition_export_task_entry); + } + + for (const auto & partition_export : partition_exports_in_zk_set) { - const auto partition_export_path = fs::path(exports_path) / partition_export; - const auto destination_storage_id = zookeeper->get(partition_export_path / "destination_storage_id"); + if (!local_export_partition_entries.contains(partition_export)) + new_entries.insert(partition_export); + } + + /// remove the removed entries from the local set + for (const auto & entry : removed_entries) + export_merge_tree_partition_task_entries.erase(entry); - if (destination_storage_id.empty()) + // add the new entries to the local set + for (const auto & entry_key : new_entries) + { + // get entry from zk + const auto entry_path = fs::path(exports_path) / entry_key; + const auto metadata_entry_path = fs::path(entry_path) / "metadata.json"; + const auto parts_to_do_path = fs::path(entry_path) / "parts_to_do"; + + std::string parts_to_do_string; + if (!zk->tryGet(parts_to_do_path, parts_to_do_string)) { - LOG_WARNING(log, "Failed to grab destination storage id for partition {} export", partition_export); + LOG_INFO(log, "Skipping..."); continue; } - const auto time_to_live_seconds = std::stoi(zookeeper->get(partition_export_path / "time_to_live_seconds")); - const auto create_time = std::stoi(zookeeper->get(partition_export_path / "create_time")); + const auto parts_to_do = std::stoull(parts_to_do_string.c_str()); - if (time(nullptr) - create_time > time_to_live_seconds) + if (parts_to_do == 0) { - LOG_INFO(log, "Partition {} export has expired, removing zk entry", partition_export); - zookeeper->tryRemoveRecursive(partition_export_path); + LOG_INFO(log, "Skipping... 
Parts to do is 0, will not load it"); continue; } - if (zookeeper->get(partition_export_path / "status") != "IN_PROGRESS") + std::string metadata_json; + if (!zk->tryGet(metadata_entry_path, metadata_json)) { - LOG_INFO(log, "Partition {} export is not in progress, skipping", partition_export); + LOG_INFO(log, "Skipping..."); continue; } - /// grab all part names, if it already has an owner, skip - /// otherwise, try to atomically claim it - const auto parts_path = partition_export_path / "parts"; - auto part_names = zookeeper->getChildren(parts_path); + const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); - for (const auto & part_name : part_names) + export_merge_tree_partition_task_entries[entry_key] = ExportReplicatedMergeTreePartitionTaskEntry {metadata, parts_to_do}; + } +} + +void StorageReplicatedMergeTree::selectPartsToExport() +{ + const auto exports_path = fs::path(zookeeper_path) / "exports"; + + auto complete_part_export = [&]( + const std::string & export_partition_path, + const std::string & part_status_path, + const std::string & parts_to_do_path, + const std::string & lock_path, + const std::string & next_idx_path, + const std::size_t next_idx_local, + const ZooKeeperPtr & zk) -> bool + { + /// todo arthur is it possible to grab stats using a multi-op? + Coordination::Stat parts_to_do_stat; + Coordination::Stat lock_stat; + Coordination::Stat next_idx_stat; + std::string parts_to_do_string; + + int retries = 0; + const int max_retries = 3; + while (retries < max_retries) { - /// The check if this replica contains the part is also performed in `exportPartToTable`. - /// Doing it here so that we don't even bother claiming the part export if we don't have it. - auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); + if (!zk->tryGet(parts_to_do_path, parts_to_do_string, &parts_to_do_stat)) + { + LOG_INFO(log, "Failed to get parts_to_do, skipping"); + return false; + } + + std::string locked_by; + + if (!zk->tryGet(lock_path, locked_by, &lock_stat)) + { + LOG_INFO(log, "Failed to get locked_by, skipping"); + return false; + } + + if (locked_by != replica_name) + { + LOG_INFO(log, "Skipping... Locked by {}, not by {}", locked_by, replica_name); + return false; + } + + std::string next_idx_string; + if (!zk->tryGet(next_idx_path, next_idx_string, &next_idx_stat)) + { + LOG_INFO(log, "Failed to get next_idx, skipping"); + return false; + } + + const std::size_t next_idx_zk = std::stoull(next_idx_string.c_str()); + + std::size_t parts_to_do = std::stoull(parts_to_do_string.c_str()); + + if (parts_to_do == 0) + { + LOG_INFO(log, "Skipping... Parts to do is 0, maybe someone else already completed it? 
that sounds weird"); + return false; + } + + parts_to_do--; + + Coordination::Requests ops; + ops.emplace_back(zkutil::makeCheckRequest(lock_path, lock_stat.version)); + ops.emplace_back(zkutil::makeCheckRequest(parts_to_do_path, parts_to_do_stat.version)); + ops.emplace_back(zkutil::makeCheckRequest(next_idx_path, next_idx_stat.version)); + ops.emplace_back(zkutil::makeSetRequest(next_idx_path, std::to_string(std::max(next_idx_zk, next_idx_local + 1)), next_idx_stat.version)); + ops.emplace_back(zkutil::makeSetRequest(part_status_path, "COMPLETED", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(lock_path, lock_stat.version)); + ops.emplace_back(zkutil::makeSetRequest(parts_to_do_path, std::to_string(parts_to_do), parts_to_do_stat.version)); + + if (parts_to_do == 0) + { + ops.emplace_back(zkutil::makeSetRequest(fs::path(export_partition_path) / "status", "COMPLETED", -1)); + } + + Coordination::Responses responses; + if (zk->tryMulti(ops, responses) == Coordination::Error::ZOK) + { + return true; + } + + retries++; + } + + return false; + }; + + auto lock_part = [&]( + const std::string & export_partition_path, + const std::string & part_path, + const std::string & status_path, + const std::string & lock_path, + const std::string & node_name, + const ZooKeeperPtr & zk) -> bool + { + Coordination::Requests ops; + + /// if the part path exists, it can be one of the following: + /// 1. PENDING and Locked - Some replica is working on it - we should skip + /// 2. PENDING and unlocked - Whoever was working on it died - we should acquire the lock + /// 3. COMPLETED and unlocked - We should skip + if (zk->exists(part_path)) + { + Coordination::Stat stat; + std::string status; + + if (zk->tryGet(status_path, status, &stat)) + { + if (status != "PENDING") + { + LOG_INFO(log, "Skipping... Status is not PENDING"); + return false; + } + + std::string parts_to_do_zk; + if (!zk->tryGet(fs::path(export_partition_path) / "parts_to_do", parts_to_do_zk)) + { + LOG_INFO(log, "Failed to get parts_to_do, skipping"); + return false; + } + + const auto parts_to_do = std::stoull(parts_to_do_zk.c_str()); + + if (parts_to_do == 0) + { + LOG_INFO(log, "Skipping... Parts to do is 0, maybe someone else already completed it?"); + return false; + } + + /// only try to lock it if the status is still PENDING + /// if we did not check for status = pending, chances are some other replica completed and released the lock in the meantime + ops.emplace_back(zkutil::makeCheckRequest(status_path, stat.version)); + /// todo do I need to "re-set" the status to PENDING? I don't think so. + ops.emplace_back(zkutil::makeCreateRequest(lock_path, node_name, zkutil::CreateMode::Ephemeral)); + /// no need to update retry count here. + } + else + { + LOG_INFO(log, "Skipping... Failed to get status, probably killed"); + return false; + } + } + else + { + /// if the node does not exist, just create everything from scratch. 
+ ops.emplace_back(zkutil::makeCreateRequest(part_path, "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(status_path, "PENDING", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(lock_path, node_name, zkutil::CreateMode::Ephemeral)); + ops.emplace_back(zkutil::makeCreateRequest(fs::path(part_path) / "retry_count", "0", zkutil::CreateMode::Persistent)); + } + + Coordination::Responses responses; + + return zk->tryMulti(ops, responses) == Coordination::Error::ZOK; + }; + + auto try_to_acquire_a_part = [&]( + const ExportReplicatedMergeTreePartitionManifest & manifest, + const std::string & partition_export_path, + const std::size_t next_idx, + const ZooKeeperPtr & zk) -> std::optional + { + for (auto i = next_idx; i < manifest.number_of_parts; i++) + { + const auto part_path = fs::path(partition_export_path) / "parts" / manifest.parts[i]; + const auto lock_path = fs::path(part_path) / "lock"; + const auto status_path = fs::path(part_path) / "status"; + + const auto part = getPartIfExists(manifest.parts[i], {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); if (!part) { - LOG_INFO(log, "Part {} not found in the replica, skipping export", part_name); + LOG_INFO(log, "Skipping... Part {} not found locally", manifest.parts[i]); continue; } - auto part_path = parts_path / part_name; - auto owner_path = part_path / "owner"; + if (lock_part(partition_export_path, part_path, status_path, lock_path, replica_name, zk)) + { + return manifest.parts[i]; + } + } + + /// failed to lock a part in the range of `next_idx...number_of_parts` + /// now try the full scan `0...next_idx` + for (auto i = 0u; i < next_idx; i++) + { + const auto part_path = fs::path(partition_export_path) / "parts" / manifest.parts[i]; + const auto lock_path = fs::path(part_path) / "lock"; + const auto status_path = fs::path(part_path) / "status"; - /// Ephemeral because it acts like a lock. AFAIK, if this node crashes, the lock will be released and another replica can claim it. - if (Coordination::Error::ZOK != zookeeper->tryCreate(owner_path, replica_name, zkutil::CreateMode::Ephemeral)) + const auto part = getPartIfExists(manifest.parts[i], {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); + if (!part) { - LOG_INFO(log, "Export partition {} part {} already has an owner or failed to claim it, skipping", partition_export, part_name); + LOG_INFO(log, "Skipping... Part {} not found locally", manifest.parts[i]); continue; } - LOG_INFO(log, "Claimed export partition {} part {}", partition_export, part_name); + if (lock_part(partition_export_path, part_path, status_path, lock_path, replica_name, zk)) + { + return manifest.parts[i]; + } + } + + return std::nullopt; + }; + + const auto zk = getZooKeeper(); - /// Two failure paths: catch block and the lambda function. The try-catch is for the case where it fails to schedule the export. - /// The lambda is for the case the export itself fails. 
+ std::lock_guard lock(export_merge_tree_partition_mutex); + + for (const auto & [key, task_entry] : export_merge_tree_partition_task_entries) + { + /// this sounds impossible, but just in case + if (task_entry.parts_to_do == 0) + { + LOG_INFO(log, "Already completed, skipping"); + continue; + } + + const auto & manifest = task_entry.manifest; + const auto & database = getContext()->resolveDatabase(manifest.destination_database); + const auto & table = manifest.destination_table; + + const auto destination_storage_id = StorageID(QualifiedTableName {database, table}); + + const auto destination_storage = DatabaseCatalog::instance().tryGetTable(destination_storage_id, getContext()); + + if (!destination_storage) + { + LOG_INFO(log, "Failed to reconstruct destination storage: {}, skipping", destination_storage_id.getNameForLogs()); + continue; + } + + const auto partition_path = fs::path(exports_path) / key; + const auto next_idx_path = fs::path(partition_path) / "next_idx"; + std::string next_idx_string; + if (!zk->tryGet(next_idx_path, next_idx_string)) + { + LOG_INFO(log, "Failed to get next_idx, skipping"); + continue; + } + + const auto next_idx = std::stoull(next_idx_string.c_str()); + + const auto part_to_export = try_to_acquire_a_part(manifest, partition_path, next_idx, zk); + + if (part_to_export.has_value()) + { try { - std::function callback = [this, partition_export, part_name, owner_path, part_path, parts_path, part_names, destination_storage_id, partition_export_path] - (MergeTreeExportManifest::CompletionCallbackResult result) + exportPartToTable( + part_to_export.value(), + destination_storage_id, + getContext(), + [this, exports_path, key, part_to_export, complete_part_export, partition_path, next_idx_path, next_idx] + (MergeTreeExportManifest::CompletionCallbackResult result) { - auto zk = getZooKeeper(); - if (!result.success) + + const auto zk_client = getZooKeeper(); + if (result.success) + { + complete_part_export( + partition_path, + fs::path(exports_path) / key / "parts" / part_to_export.value() / "status", + fs::path(exports_path) / key / "parts_to_do", + fs::path(exports_path) / key / "parts" / part_to_export.value() / "lock", + next_idx_path, + next_idx, + zk_client); + + /// maybe get up to date from complete_parts_export? + std::lock_guard inner_lock(export_merge_tree_partition_mutex); + export_merge_tree_partition_task_entries[key].parts_to_do--; + + if (export_merge_tree_partition_task_entries[key].parts_to_do == 0) + { + export_merge_tree_partition_task_entries.erase(key); + } + } + else { - LOG_INFO(log, "Failed to export part {}, releasing lock", part_name); + /// increment retry_count + /// if above threshhold, fail the entire export - hopefully it is safe to do so :D + /// I could also leave this for the cleanup thread, but will do it here for now. 
- const auto retry_count = std::stoi(zk->get(part_path / "retry_count")) + 1; - const auto max_retry_count = std::stoi(zk->get(part_path / "max_retry_count")); + bool erase_entry = false; + Coordination::Requests ops; - if (retry_count >= max_retry_count) + std::string retry_count_string; + if (zk_client->tryGet(fs::path(exports_path) / key / "parts" / part_to_export.value() / "retry_count", retry_count_string)) { - LOG_INFO(log, "Failed to export part {} after {} retries, setting status to FAILED", part_name, retry_count); - zk->set(partition_export_path / "status", "FAILED"); + std::size_t retry_count = std::stoull(retry_count_string.c_str()) + 1; + + //// todo arthur unhardcode this + if (retry_count > 3) + { + ops.emplace_back(zkutil::makeRemoveRequest(partition_path, -1)); + erase_entry = true; + } + else + { + ops.emplace_back(zkutil::makeSetRequest(fs::path(exports_path) / key / "parts" / part_to_export.value() / "retry_count", std::to_string(retry_count), -1)); + /// unlock the part + ops.emplace_back(zkutil::makeRemoveRequest(fs::path(exports_path) / key / "parts" / part_to_export.value() / "lock", -1)); + } } else { - zk->set(part_path / "retry_count", std::to_string(retry_count)); + LOG_INFO(log, "Failed to get retry_count, will not try to update it"); + ops.emplace_back(zkutil::makeRemoveRequest(fs::path(partition_path) / "parts" / part_to_export.value() / "lock", -1)); } - zk->remove(owner_path); - return; - } - - zk->create(part_path / "path", result.relative_path_in_destination_storage, zkutil::CreateMode::Persistent); - LOG_INFO(log, "Exported partition {} part {} to {}", partition_export, part_name, result.relative_path_in_destination_storage); - - /// loop over all parts again and check if they all have the path set, if so, mark the export as done - Strings exported_paths; - for (const auto & possibly_commited_part : part_names) - { - /// todo arthur is this a case of "try to write as opposed to try to read because it might be stale?" - const auto possibly_commited_part_path = parts_path / possibly_commited_part; - if (!zk->exists(possibly_commited_part_path / "path")) + Coordination::Responses responses; + if (zk_client->tryMulti(ops, responses) != Coordination::Error::ZOK) { - break; + LOG_INFO(log, "All failure mechanism failed, will not try to update it"); + return; } - exported_paths.push_back(zk->get(possibly_commited_part_path / "path")); - } - - bool exported_all_parts = exported_paths.size() == part_names.size(); - if (!exported_all_parts) - { - return; - } - const auto destination_storage = DatabaseCatalog::instance().tryGetTable( - StorageID(QualifiedTableName::parseFromString(destination_storage_id)), - getContext()); - const auto transaction_id = zk->get(partition_export_path / "transaction_id"); - /// possibly need to add a check here to see if the table still exists - destination_storage->commitExportPartitionTransaction(transaction_id, partition_export, exported_paths, getContext()); - zk->tryCreate(partition_export_path / "status", "COMPLETED", zkutil::CreateMode::Persistent); - }; - - exportPartToTable( - part_name, - StorageID(QualifiedTableName::parseFromString(destination_storage_id)), - getContext(), - callback); + if (erase_entry) + { + std::lock_guard inner_lock(export_merge_tree_partition_mutex); + export_merge_tree_partition_task_entries.erase(key); + return; + } - /// for now, export only one part at a time - break; + } + }); } catch (...) 
{ + /// failed to schedule the part export tryLogCurrentException(log, __PRETTY_FUNCTION__); - zookeeper->remove(owner_path); - /// for now, export only one part at a time - break; + /// best-effort to remove the lock (actually, we should make sure the lock is released..) + zk->tryRemove(fs::path(partition_path) / "parts" / part_to_export.value() / "lock"); } + } } @@ -5896,6 +6177,8 @@ void StorageReplicatedMergeTree::partialShutdown() queue_updating_task->deactivate(); mutations_updating_task->deactivate(); mutations_finalizing_task->deactivate(); + export_merge_tree_partition_updating_task->deactivate(); + export_merge_tree_partition_select_task->deactivate(); cleanup_thread.stop(); async_block_ids_cache.stop(); @@ -8161,6 +8444,32 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & const auto exports_path = fs::path(zookeeper_path) / "exports"; Coordination::Requests ops; + /* + exports + partition_id + target storage id: (or may be hash is safer?) + metadata (znode) <- immutable, every replica read it once to get full meta, znode mtime - is a timestamp + parition id + destination id + source replica + number of parts: 100 + list of parts: <- processed in strict order + 2020_0_0_1 + 2020_1_1_1 + ... + parts_to_do: 100 (znode) + exceptions_per_replica (znode) + replica1: + num_exceptions: 1 + last_exception (znode, znode mtime - is a timestamp of last exception) + part: + exception + parts/ + part_name/ <-- the value of that znode is pending initially, and finished later. + lock = ephemeral, when processing + replica: r1 + start_time: 123445 + */ + /// maybe this should go in initialization somewhere else if (!zookeeper->exists(exports_path)) { @@ -8177,93 +8486,77 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & ops.emplace_back(zkutil::makeCreateRequest(partition_exports_path, "", zkutil::CreateMode::Persistent)); - const Strings parts = zookeeper->getChildren(fs::path(replica_path) / "parts"); - const ActiveDataPartSet active_parts_set(format_version, parts); - const auto part_infos = active_parts_set.getPartInfos(); - std::vector parts_to_export; - for (const auto & part_info : part_infos) + auto data_parts_lock = lockParts(); + + const auto parts = getDataPartsVectorInPartitionForInternalUsage(MergeTreeDataPartState::Active, partition_id, &data_parts_lock); + + // const Strings parts = zookeeper->getChildren(fs::path(replica_path) / "parts"); + // const ActiveDataPartSet active_parts_set(format_version, parts); + // const auto part_infos = active_parts_set.getPartInfos(); + // std::vector parts_to_export; + // for (const auto & part : parts) + // { + // if (part_info.getPartitionId() == partition_id) + // { + // parts_to_export.push_back(part_info.getPartNameV1()); + // } + // } + + if (parts.empty()) { - if (part_info.getPartitionId() == partition_id) - { - parts_to_export.push_back(part_info.getPartNameV1()); - } + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition {} doesn't exist", partition_id); } - if (parts_to_export.empty()) + std::vector part_names; + for (const auto & part : parts) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition {} doesn't exist", partition_id); + part_names.push_back(part->name); } - /// somehow check if the list of parts is updated "enough" + /// TODO arthur somehow check if the list of parts is updated "enough" - /* - table_path/exports/partition_id - table_path/exports/partition_id/transaction_id - table_path/exports/partition_id/destination_storage_id - 
table_path/exports/partition_id/status - table_path/exports/partition_id/create_time - table_path/exports/partition_id/time_to_live_seconds - table_path/exports/partition_id/part_name_1 - table_path/exports/partition_id/part_name_1/status - table_path/exports/partition_id/part_name_1/owner - table_path/exports/partition_id/part_name_1/retry_count - table_path/exports/partition_id/part_name_1/max_retry_count - table_path/exports/partition_id/part_name_1/path - table_path/exports/partition_id/part_name_n - ... - */ + ExportReplicatedMergeTreePartitionManifest manifest; - const auto transaction_id = std::to_string(generateSnowflakeID()); + manifest.transaction_id = std::to_string(generateSnowflakeID()); + manifest.partition_id = partition_id; + manifest.destination_database = command.to_database; + manifest.destination_table = command.to_table; + manifest.source_replica = replica_name; + manifest.number_of_parts = part_names.size(); + manifest.parts = part_names; + manifest.create_time = time(nullptr); - const auto parts_path = fs::path(partition_exports_path) / "parts"; ops.emplace_back(zkutil::makeCreateRequest( - parts_path, "", + fs::path(partition_exports_path) / "metadata.json", + manifest.toJsonString(), zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest( - fs::path(partition_exports_path) / "transaction_id", - transaction_id, + fs::path(partition_exports_path) / "parts_to_do", + std::to_string(part_names.size()), zkutil::CreateMode::Persistent)); - + ops.emplace_back(zkutil::makeCreateRequest( - fs::path(partition_exports_path) / "destination_storage_id", - dest_storage->getStorageID().getFullTableName(), + fs::path(partition_exports_path) / "next_idx", + "0", zkutil::CreateMode::Persistent)); - /// status: IN_PROGRESS, COMPLETED, FAILED ops.emplace_back(zkutil::makeCreateRequest( - fs::path(partition_exports_path) / "status", - "IN_PROGRESS", + fs::path(partition_exports_path) / "exceptions_per_replica", + "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest( - fs::path(partition_exports_path) / "create_time", - std::to_string(time(nullptr)), + fs::path(partition_exports_path) / "parts", + "", zkutil::CreateMode::Persistent)); - + /// status: IN_PROGRESS, COMPLETED, FAILED ops.emplace_back(zkutil::makeCreateRequest( - fs::path(partition_exports_path) / "time_to_live_seconds", - std::to_string(3600), + fs::path(partition_exports_path) / "status", + "PENDING", zkutil::CreateMode::Persistent)); - for (const String & part_name : parts_to_export) - { - const String part_path = fs::path(parts_path) / part_name; - - ops.emplace_back(zkutil::makeCreateRequest(part_path, "", zkutil::CreateMode::Persistent)); - - ops.emplace_back(zkutil::makeCreateRequest( - fs::path(part_path) / "retry_count", - "0", - zkutil::CreateMode::Persistent)); - - ops.emplace_back(zkutil::makeCreateRequest( - fs::path(part_path) / "max_retry_count", - "3", - zkutil::CreateMode::Persistent)); - } - Coordination::Responses responses; Coordination::Error code = zookeeper->tryMulti(ops, responses); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index f730ff29098a..715f32ea28dd 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -506,8 +507,16 @@ class StorageReplicatedMergeTree final : public MergeTreeData /// A task that marks finished mutations as done. 
BackgroundSchedulePoolTaskHolder mutations_finalizing_task; + BackgroundSchedulePoolTaskHolder export_merge_tree_partition_updating_task; + + Coordination::WatchCallbackPtr export_merge_tree_partition_watch_callback; + + std::mutex export_merge_tree_partition_mutex; + BackgroundSchedulePoolTaskHolder export_merge_tree_partition_select_task; + + std::unordered_map export_merge_tree_partition_task_entries; /// A thread that removes old parts, log entries, and blocks. ReplicatedMergeTreeCleanupThread cleanup_thread; @@ -739,6 +748,9 @@ class StorageReplicatedMergeTree final : public MergeTreeData void selectPartsToExport(); + /// update in-memory list of partition exports + void exportMergeTreePartitionUpdatingTask(); + /** Write the selected parts to merge into the log, * Call when merge_selecting_mutex is locked. * Returns false if any part is not in ZK. From 4b8632ed96aa90e6e4aff99da333c99024e41397 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 10 Oct 2025 07:19:24 -0300 Subject: [PATCH 03/17] able to export partition using two different replicas and upload commit file --- src/Storages/StorageReplicatedMergeTree.cpp | 79 +++++++++++++++++---- 1 file changed, 65 insertions(+), 14 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index f8482ecfe4a0..73c7f86076d9 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -115,6 +115,7 @@ #include #include "Interpreters/StorageID.h" +#include "QueryPipeline/QueryPlanResourceHolder.h" #include "Storages/ExportReplicatedMergeTreePartitionManifest.h" #include "Storages/ExportReplicatedMergeTreePartitionTaskEntry.h" #include @@ -4465,11 +4466,16 @@ void StorageReplicatedMergeTree::selectPartsToExport() auto complete_part_export = [&]( const std::string & export_partition_path, + const std::string & part_path, const std::string & part_status_path, const std::string & parts_to_do_path, const std::string & lock_path, const std::string & next_idx_path, const std::size_t next_idx_local, + const std::string & path_in_destination_storage_path, + const StoragePtr & destination_storage, + const std::string & transaction_id, + const std::string & partition_id, const ZooKeeperPtr & zk) -> bool { /// todo arthur is it possible to grab stats using a multi-op? 
@@ -4529,9 +4535,38 @@ void StorageReplicatedMergeTree::selectPartsToExport() ops.emplace_back(zkutil::makeSetRequest(part_status_path, "COMPLETED", -1)); ops.emplace_back(zkutil::makeRemoveRequest(lock_path, lock_stat.version)); ops.emplace_back(zkutil::makeSetRequest(parts_to_do_path, std::to_string(parts_to_do), parts_to_do_stat.version)); + ops.emplace_back(zkutil::makeCreateRequest(fs::path(part_path) / "finished_by", replica_name, zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(fs::path(part_path) / "path_in_destination_storage", path_in_destination_storage_path, zkutil::CreateMode::Persistent)); if (parts_to_do == 0) { + /// loop over all parts under `/parts` and grab all paths in destination storage + Strings exported_paths; + const auto parts_path = fs::path(export_partition_path) / "parts"; + Strings parts = zk->getChildren(parts_path); + + for (const auto & part : parts) + { + std::string path_in_destination_storage; + const auto path_in_destination_storage_zk_path = fs::path(parts_path) / part / "path_in_destination_storage"; + + if (zk->tryGet(path_in_destination_storage_zk_path, path_in_destination_storage)) + { + exported_paths.push_back(path_in_destination_storage); + } + else + { + /// todo arthur what should I do here? + LOG_WARNING(log, "Failed to get path_in_destination_storage for part {} in export", part); + } + } + + LOG_INFO(log, "Collected {} exported paths for export", exported_paths.size()); + + /// manually add the export we just finished because it is not zk yet + exported_paths.push_back(path_in_destination_storage_path); + + destination_storage->commitExportPartitionTransaction(transaction_id, partition_id, exported_paths, getContext()); ops.emplace_back(zkutil::makeSetRequest(fs::path(export_partition_path) / "status", "COMPLETED", -1)); } @@ -4669,14 +4704,30 @@ void StorageReplicatedMergeTree::selectPartsToExport() std::lock_guard lock(export_merge_tree_partition_mutex); - for (const auto & [key, task_entry] : export_merge_tree_partition_task_entries) + for (auto & [key, task_entry] : export_merge_tree_partition_task_entries) { /// this sounds impossible, but just in case if (task_entry.parts_to_do == 0) { LOG_INFO(log, "Already completed, skipping"); continue; - } + } + + std::string parts_to_do_string; + if (!zk->tryGet(fs::path(exports_path) / key / "parts_to_do", parts_to_do_string)) + { + LOG_INFO(log, "Failed to get parts_to_do, skipping"); + continue; + } + + const auto parts_to_do = std::stoull(parts_to_do_string.c_str()); + task_entry.parts_to_do = parts_to_do; + + if (task_entry.parts_to_do == 0) + { + LOG_INFO(log, "Already completed, skipping"); + continue; + } const auto & manifest = task_entry.manifest; const auto & database = getContext()->resolveDatabase(manifest.destination_database); @@ -4705,6 +4756,9 @@ void StorageReplicatedMergeTree::selectPartsToExport() const auto part_to_export = try_to_acquire_a_part(manifest, partition_path, next_idx, zk); + const auto partition_id = manifest.partition_id; + const auto transaction_id = manifest.transaction_id; + if (part_to_export.has_value()) { try @@ -4713,20 +4767,26 @@ void StorageReplicatedMergeTree::selectPartsToExport() part_to_export.value(), destination_storage_id, getContext(), - [this, exports_path, key, part_to_export, complete_part_export, partition_path, next_idx_path, next_idx] + [this, partition_id, transaction_id, exports_path, key, part_to_export, complete_part_export, partition_path, next_idx_path, next_idx, destination_storage] 
(MergeTreeExportManifest::CompletionCallbackResult result) { const auto zk_client = getZooKeeper(); if (result.success) { + complete_part_export( partition_path, + fs::path(exports_path) / key / "parts" / part_to_export.value(), fs::path(exports_path) / key / "parts" / part_to_export.value() / "status", fs::path(exports_path) / key / "parts_to_do", fs::path(exports_path) / key / "parts" / part_to_export.value() / "lock", next_idx_path, next_idx, + result.relative_path_in_destination_storage, + destination_storage, + transaction_id, + partition_id, zk_client); /// maybe get up to date from complete_parts_export? @@ -4734,7 +4794,7 @@ void StorageReplicatedMergeTree::selectPartsToExport() export_merge_tree_partition_task_entries[key].parts_to_do--; if (export_merge_tree_partition_task_entries[key].parts_to_do == 0) - { + { export_merge_tree_partition_task_entries.erase(key); } } @@ -4744,7 +4804,6 @@ void StorageReplicatedMergeTree::selectPartsToExport() /// if above threshhold, fail the entire export - hopefully it is safe to do so :D /// I could also leave this for the cleanup thread, but will do it here for now. - bool erase_entry = false; Coordination::Requests ops; std::string retry_count_string; @@ -4755,8 +4814,7 @@ void StorageReplicatedMergeTree::selectPartsToExport() //// todo arthur unhardcode this if (retry_count > 3) { - ops.emplace_back(zkutil::makeRemoveRequest(partition_path, -1)); - erase_entry = true; + ops.emplace_back(zkutil::makeRemoveRecursiveRequest(partition_path, 1000)); } else { @@ -4778,13 +4836,6 @@ void StorageReplicatedMergeTree::selectPartsToExport() return; } - if (erase_entry) - { - std::lock_guard inner_lock(export_merge_tree_partition_mutex); - export_merge_tree_partition_task_entries.erase(key); - return; - } - } }); } From c067926a96c466cdbc16830c3556beaed1e85a26 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 13 Oct 2025 08:03:53 -0300 Subject: [PATCH 04/17] checkpoint --- src/Storages/StorageReplicatedMergeTree.cpp | 166 ++++++++++---------- 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 73c7f86076d9..6d135c9d0a37 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4345,119 +4345,119 @@ void StorageReplicatedMergeTree::mutationsFinalizingTask() } } -// static void exportPartitionCallback( -// MergeTreeExportManifest::CompletionCallbackResult result, -// zkutil::ZooKeeperPtr & zookeeper, -// const String & partition_export, -// const String & part_name) -// { -// if (!result.success) -// { -// LOG_INFO(log, "Failed to export partition {} part {}, releasing lock", partition_export, part_name); -// zookeeper->remove(owner_path); -// return; -// } - -// zookeeper->create(part_path / "path", result.relative_path_in_destination_storage, zkutil::CreateMode::Persistent); -// LOG_INFO(log, "Exported partition {} part {} to {}", partition_export, part_name, result.relative_path_in_destination_storage); - -// /// loop over all parts again and check if they all have the path set, if so, mark the export as done -// bool completed = true; -// Strings exported_paths; -// for (const auto & possibly_commited_part : part_names) -// { -// /// todo arthur is this a case of "try to write as opposed to try to read because it might be stale?" 
-// const auto possibly_commited_part_path = parts_path / possibly_commited_part; -// if (!zookeeper->exists(possibly_commited_part_path / "path")) -// { -// completed = false; -// break; -// } -// exported_paths.push_back(zookeeper->get(possibly_commited_part_path / "path")); -// } - -// if (completed) -// { - -// const auto destination_storage = DatabaseCatalog::instance().tryGetTable( -// StorageID(QualifiedTableName::parseFromString(destination_storage_id)), -// getContext()); -// const auto transaction_id = zookeeper->get(partition_export_path / "transaction_id"); -// /// possibly need to add a check here to see if the table still exists -// destination_storage->commitExportPartitionTransaction(transaction_id, partition_export, exported_paths, getContext()); -// zookeeper->set(partition_export_path / "status", "COMPLETED"); -// } -// } - void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() { std::lock_guard lock(export_merge_tree_partition_mutex); auto zk = getZooKeeper(); - const auto exports_path = fs::path(zookeeper_path) / "exports"; - /// why do I need the stat? - Coordination::Stat stat; - const auto partition_exports_in_zk = zk->getChildrenWatch(exports_path, &stat, export_merge_tree_partition_watch_callback); - const auto partition_exports_in_zk_set = std::unordered_set(partition_exports_in_zk.begin(), partition_exports_in_zk.end()); + const std::string exports_path = fs::path(zookeeper_path) / "exports"; + const std::string cleanup_lock_path = fs::path(zookeeper_path) / "exports_cleanup_lock"; - std::unordered_set local_export_partition_entries; - for (const auto & entry : export_merge_tree_partition_task_entries) - local_export_partition_entries.insert(entry.first); + bool cleanup_lock_acquired = zk->tryCreate(cleanup_lock_path, "", zkutil::CreateMode::Ephemeral) == Coordination::Error::ZOK; - std::unordered_set new_entries; - std::unordered_set removed_entries; - - for (const auto & partition_export_task_entry : local_export_partition_entries) + if (cleanup_lock_acquired) { - if (!partition_exports_in_zk_set.contains(partition_export_task_entry)) - removed_entries.insert(partition_export_task_entry); + LOG_INFO(log, "Cleanup lock acquired, will remove stale entries"); } - for (const auto & partition_export : partition_exports_in_zk_set) - { - if (!local_export_partition_entries.contains(partition_export)) - new_entries.insert(partition_export); - } + Coordination::Stat stat; + const auto children = zk->getChildrenWatch(exports_path, &stat, export_merge_tree_partition_watch_callback); + const std::unordered_set zk_children(children.begin(), children.end()); - /// remove the removed entries from the local set - for (const auto & entry : removed_entries) - export_merge_tree_partition_task_entries.erase(entry); + const auto now = time(nullptr); - // add the new entries to the local set - for (const auto & entry_key : new_entries) + /// Load new entries + /// If we have the cleanup lock, also remove stale entries from zk and local + for (const auto & key : zk_children) { - // get entry from zk - const auto entry_path = fs::path(exports_path) / entry_key; - const auto metadata_entry_path = fs::path(entry_path) / "metadata.json"; - const auto parts_to_do_path = fs::path(entry_path) / "parts_to_do"; + if (!cleanup_lock_acquired && export_merge_tree_partition_task_entries.contains(key)) + continue; - std::string parts_to_do_string; - if (!zk->tryGet(parts_to_do_path, parts_to_do_string)) + const std::string entry_path = fs::path(exports_path) / key; + + 
std::string metadata_json; + if (!zk->tryGet(fs::path(entry_path) / "metadata.json", metadata_json)) { - LOG_INFO(log, "Skipping..."); + LOG_INFO(log, "Skipping {}: missing metadata.json", key); continue; } - const auto parts_to_do = std::stoull(parts_to_do_string.c_str()); + std::string status; + if (!zk->tryGet(fs::path(entry_path) / "status", status)) + { + LOG_INFO(log, "Skipping {}: missing status", key); + continue; + } - if (parts_to_do == 0) + bool is_not_pending = status != "PENDING"; + + const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); + + if (cleanup_lock_acquired) + { + bool has_expired = metadata.create_time < now - 45; + + if (has_expired && is_not_pending) + { + zk->tryRemoveRecursive(fs::path(entry_path)); + export_merge_tree_partition_task_entries.erase(key); + LOG_INFO(log, "Removed {}: expired", key); + continue; + } + } + + if (is_not_pending) { - LOG_INFO(log, "Skipping... Parts to do is 0, will not load it"); + LOG_INFO(log, "Skipping {}: status is not PENDING", key); continue; } - std::string metadata_json; - if (!zk->tryGet(metadata_entry_path, metadata_json)) + if (export_merge_tree_partition_task_entries.contains(key)) { - LOG_INFO(log, "Skipping..."); + LOG_INFO(log, "Skipping {}: already exists", key); continue; } - const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); + std::string parts_to_do_str; + if (!zk->tryGet(fs::path(entry_path) / "parts_to_do", parts_to_do_str)) + { + LOG_INFO(log, "Skipping {}: no parts_to_do", key); + continue; + } - export_merge_tree_partition_task_entries[entry_key] = ExportReplicatedMergeTreePartitionTaskEntry {metadata, parts_to_do}; + uint64_t parts_to_do = 0; + try + { + parts_to_do = std::stoull(parts_to_do_str); + } + catch (...) + { + LOG_WARNING(log, "Skipping {}: invalid parts_to_do='{}'", key, parts_to_do_str); + continue; + } + + if (parts_to_do == 0) + { + LOG_INFO(log, "Skipping {}: parts_to_do is 0", key); + continue; + } + + export_merge_tree_partition_task_entries.emplace( + key, + ExportReplicatedMergeTreePartitionTaskEntry{std::move(metadata), parts_to_do}); + } + + /// Remove entries that were deleted by someone else + std::erase_if(export_merge_tree_partition_task_entries, + [&](auto const & kv) { return !zk_children.contains(kv.first); }); + + if (cleanup_lock_acquired) + { + zk->tryRemove(cleanup_lock_path); } + + export_merge_tree_partition_updating_task->scheduleAfter(30 * 1000); } void StorageReplicatedMergeTree::selectPartsToExport() From dad9464dd2ff96112295d7da59e77cfc36914e49 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 13 Oct 2025 09:16:34 -0300 Subject: [PATCH 05/17] some changes --- src/Storages/StorageReplicatedMergeTree.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 6d135c9d0a37..0bd36a7a5a8c 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4838,6 +4838,9 @@ void StorageReplicatedMergeTree::selectPartsToExport() } }); + + /// managed to schedule a task, re-run immediately to pick up more tasks if possible + export_merge_tree_partition_select_task->schedule(); } catch (...) { @@ -4846,12 +4849,15 @@ void StorageReplicatedMergeTree::selectPartsToExport() /// best-effort to remove the lock (actually, we should make sure the lock is released..) 
zk->tryRemove(fs::path(partition_path) / "parts" / part_to_export.value() / "lock"); + + /// re-run after some time + export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); } } } - export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); + // export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); } @@ -8463,8 +8469,9 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & "Exporting merge tree part is experimental. Set `allow_experimental_export_merge_tree_part` to enable it"); } - String dest_database = query_context->resolveDatabase(command.to_database); - auto dest_storage = DatabaseCatalog::instance().getTable({dest_database, command.to_table}, query_context); + const auto dest_database = query_context->resolveDatabase(command.to_database); + const auto dest_table = command.to_table; + auto dest_storage = DatabaseCatalog::instance().getTable({dest_database, dest_table}, query_context); if (dest_storage->getStorageID() == this->getStorageID()) { @@ -8570,8 +8577,8 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & manifest.transaction_id = std::to_string(generateSnowflakeID()); manifest.partition_id = partition_id; - manifest.destination_database = command.to_database; - manifest.destination_table = command.to_table; + manifest.destination_database = dest_database; + manifest.destination_table = dest_table; manifest.source_replica = replica_name; manifest.number_of_parts = part_names.size(); manifest.parts = part_names; From fa02f306b2f033817635e6de3f51b1cd5f2628f6 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 13 Oct 2025 10:27:05 -0300 Subject: [PATCH 06/17] add a silly test --- src/Storages/StorageReplicatedMergeTree.cpp | 4 +- ...3604_export_merge_tree_partition.reference | 31 +++++++++++ .../03604_export_merge_tree_partition.sh | 54 +++++++++++++++++++ 3 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/03604_export_merge_tree_partition.reference create mode 100755 tests/queries/0_stateless/03604_export_merge_tree_partition.sh diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 0bd36a7a5a8c..7c4acac936a6 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4458,6 +4458,7 @@ void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() } export_merge_tree_partition_updating_task->scheduleAfter(30 * 1000); + export_merge_tree_partition_select_task->schedule(); } void StorageReplicatedMergeTree::selectPartsToExport() @@ -4838,9 +4839,6 @@ void StorageReplicatedMergeTree::selectPartsToExport() } }); - - /// managed to schedule a task, re-run immediately to pick up more tasks if possible - export_merge_tree_partition_select_task->schedule(); } catch (...) 
{ diff --git a/tests/queries/0_stateless/03604_export_merge_tree_partition.reference b/tests/queries/0_stateless/03604_export_merge_tree_partition.reference new file mode 100644 index 000000000000..97e17ae5aeec --- /dev/null +++ b/tests/queries/0_stateless/03604_export_merge_tree_partition.reference @@ -0,0 +1,31 @@ + +Select from source table +1 2020 +2 2020 +3 2020 +4 2021 +5 2021 +6 2022 +7 2022 +Select from destination table +1 2020 +2 2020 +3 2020 +4 2021 +Export partition 2022 +Select from destination table again +1 2020 +2 2020 +3 2020 +4 2021 +5 2021 +6 2022 +7 2022 +---- Data in roundtrip ReplicatedMergeTree table (should match s3_table) +1 2020 +2 2020 +3 2020 +4 2021 +5 2021 +6 2022 +7 2022 diff --git a/tests/queries/0_stateless/03604_export_merge_tree_partition.sh b/tests/queries/0_stateless/03604_export_merge_tree_partition.sh new file mode 100755 index 000000000000..1198281afc72 --- /dev/null +++ b/tests/queries/0_stateless/03604_export_merge_tree_partition.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Tags: replica, no-parallel, no-replicated-database + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +rmt_table="rmt_table_${RANDOM}" +s3_table="s3_table_${RANDOM}" +rmt_table_roundtrip="rmt_table_roundtrip_${RANDOM}" + +query() { + $CLICKHOUSE_CLIENT --query "$1" +} + +query "DROP TABLE IF EXISTS $rmt_table, $s3_table, $rmt_table_roundtrip" + +query "CREATE TABLE $rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/$rmt_table', 'replica1') PARTITION BY year ORDER BY tuple()" +query "CREATE TABLE $s3_table (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='$s3_table', format=Parquet, partition_strategy='hive') PARTITION BY year" + +query "INSERT INTO $rmt_table VALUES (1, 2020), (2, 2020), (4, 2021)" + +query "INSERT INTO $rmt_table VALUES (3, 2020), (5, 2021)" + +query "INSERT INTO $rmt_table VALUES (6, 2022), (7, 2022)" + +query "ALTER TABLE $rmt_table EXPORT PARTITION ID '2020' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" + +query "ALTER TABLE $rmt_table EXPORT PARTITION ID '2021' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" + +# todo poll some kind of status +sleep 15 + +echo "Select from source table" +query "SELECT * FROM $rmt_table ORDER BY id" + +echo "Select from destination table" +query "SELECT * FROM $s3_table ORDER BY id" + +echo "Export partition 2022" +query "ALTER TABLE $rmt_table EXPORT PARTITION ID '2022' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" + +# todo poll some kind of status +sleep 5 + +echo "Select from destination table again" +query "SELECT * FROM $s3_table ORDER BY id" + +query "CREATE TABLE $rmt_table_roundtrip ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/$rmt_table_roundtrip', 'replica1') PARTITION BY year ORDER BY tuple() AS SELECT * FROM $s3_table" + +echo "---- Data in roundtrip ReplicatedMergeTree table (should match s3_table)" +query "SELECT * FROM $rmt_table_roundtrip ORDER BY id" + +query "DROP TABLE IF EXISTS $rmt_table, $s3_table, $rmt_table_roundtrip" \ No newline at end of file From ed9f636bd4b00b3acfe76e0d854630bf2cd5aa34 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 13 Oct 2025 11:20:32 -0300 Subject: [PATCH 07/17] hold parts references to prevent deletion --- .../ExportReplicatedMergeTreePartitionTaskEntry.h | 8 ++++++++ src/Storages/StorageReplicatedMergeTree.cpp | 12 +++++++++++- 2 files 
changed, 19 insertions(+), 1 deletion(-) diff --git a/src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h b/src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h index 122c29b1b244..bdaef2ec3fa7 100644 --- a/src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h +++ b/src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h @@ -1,13 +1,21 @@ #pragma once #include +#include namespace DB { struct ExportReplicatedMergeTreePartitionTaskEntry { + using DataPartPtr = std::shared_ptr; ExportReplicatedMergeTreePartitionManifest manifest; std::size_t parts_to_do; + /// References to the parts that should be exported + /// This is used to prevent the parts from being deleted before finishing the export operation + /// It does not mean this replica will export all the parts + /// There is also a chance this replica does not contain a given part and it is totally ok. + std::vector part_references; }; + } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 7c4acac936a6..29f1ae868d34 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4443,9 +4443,19 @@ void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() continue; } + std::vector part_references; + + for (const auto & part_name : metadata.parts) + { + if (const auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated})) + { + part_references.push_back(part); + } + } + export_merge_tree_partition_task_entries.emplace( key, - ExportReplicatedMergeTreePartitionTaskEntry{std::move(metadata), parts_to_do}); + ExportReplicatedMergeTreePartitionTaskEntry {metadata, parts_to_do, std::move(part_references)}); } /// Remove entries that were deleted by someone else From 49e9a98cc2e149ec8f2be515e97f7fd05e9d041a Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 13 Oct 2025 13:51:49 -0300 Subject: [PATCH 08/17] fix a few tests --- src/Storages/MergeTree/MergeTreeData.cpp | 6 ++++-- tests/queries/0_stateless/01271_show_privileges.reference | 1 + .../02221_system_zookeeper_unrestricted.reference | 2 ++ .../02221_system_zookeeper_unrestricted_like.reference | 1 + 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 8e4ecd57127c..a37759c43e37 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5906,9 +5906,11 @@ void MergeTreeData::exportPartToTable(const PartitionCommand & command, ContextP "Exporting merge tree part is experimental. 
Set `allow_experimental_export_merge_tree_part` to enable it"); } - auto part_name = command.partition->as().value.safeGet(); + const auto part_name = command.partition->as().value.safeGet(); - exportPartToTable(part_name, StorageID{command.to_database, command.to_table}, query_context); + const auto database_name = query_context->resolveDatabase(command.to_database); + + exportPartToTable(part_name, StorageID{database_name, command.to_table}, query_context); } void MergeTreeData::exportPartToTable( diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 3cd42125ace0..56533c8f0e49 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -44,6 +44,7 @@ ALTER MATERIALIZE TTL ['MATERIALIZE TTL'] TABLE ALTER TABLE ALTER SETTINGS ['ALTER SETTING','ALTER MODIFY SETTING','MODIFY SETTING','RESET SETTING'] TABLE ALTER TABLE ALTER MOVE PARTITION ['ALTER MOVE PART','MOVE PARTITION','MOVE PART'] TABLE ALTER TABLE ALTER EXPORT PART ['ALTER EXPORT PART','EXPORT PART'] TABLE ALTER TABLE +ALTER EXPORT PARTITION ['ALTER EXPORT PARTITION','EXPORT PARTITION'] TABLE ALTER TABLE ALTER FETCH PARTITION ['ALTER FETCH PART','FETCH PARTITION'] TABLE ALTER TABLE ALTER FREEZE PARTITION ['FREEZE PARTITION','UNFREEZE'] TABLE ALTER TABLE ALTER UNLOCK SNAPSHOT ['UNLOCK SNAPSHOT'] TABLE ALTER TABLE diff --git a/tests/queries/0_stateless/02221_system_zookeeper_unrestricted.reference b/tests/queries/0_stateless/02221_system_zookeeper_unrestricted.reference index 137fb0587cc6..0c4e18548017 100644 --- a/tests/queries/0_stateless/02221_system_zookeeper_unrestricted.reference +++ b/tests/queries/0_stateless/02221_system_zookeeper_unrestricted.reference @@ -18,6 +18,8 @@ columns columns creator_info creator_info +exports +exports failed_parts failed_parts flags diff --git a/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference b/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference index 2893c2a845fd..fe4cbcad7b5d 100644 --- a/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference +++ b/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference @@ -51,6 +51,7 @@ blocks columns columns creator_info +exports failed_parts flags host From c88abfe1e5cb5d0e9774090b90719064d1f27126 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 14 Oct 2025 14:59:12 -0300 Subject: [PATCH 09/17] try to fix integ test failure and fix failure handling --- src/Storages/MergeTree/MergeTreeData.cpp | 15 +- .../MergeTree/MergeTreeExportManifest.h | 17 +- src/Storages/StorageReplicatedMergeTree.cpp | 569 ++++++++++-------- 3 files changed, 339 insertions(+), 262 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index a37759c43e37..e2ae830544fc 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6027,7 +6027,7 @@ void MergeTreeData::exportPartToTableImpl( export_manifests.erase(manifest); if (manifest.completion_callback) - manifest.completion_callback({}); + manifest.completion_callback(MergeTreeExportManifest::CompletionCallbackResult::createFailure(e.message())); return; } @@ -6101,6 +6101,13 @@ void MergeTreeData::exportPartToTableImpl( CompletedPipelineExecutor exec(pipeline); exec.execute(); + volatile bool x = true; + + if (x) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Some issue"); 
+ } + std::lock_guard inner_lock(export_manifests_mutex); writePartLog( PartLogElement::Type::EXPORT_PART, @@ -6119,9 +6126,9 @@ void MergeTreeData::exportPartToTableImpl( ProfileEvents::increment(ProfileEvents::PartsExportTotalMilliseconds, static_cast((*exports_list_entry)->elapsed * 1000)); if (manifest.completion_callback) - manifest.completion_callback({true, destination_file_path}); + manifest.completion_callback(MergeTreeExportManifest::CompletionCallbackResult::createSuccess(destination_file_path)); } - catch (...) + catch (const Exception & e) { tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while exporting the part {}. User should retry.", manifest.data_part->name)); @@ -6142,7 +6149,7 @@ void MergeTreeData::exportPartToTableImpl( export_manifests.erase(manifest); if (manifest.completion_callback) - manifest.completion_callback({}); + manifest.completion_callback(MergeTreeExportManifest::CompletionCallbackResult::createFailure(e.message())); throw; } diff --git a/src/Storages/MergeTree/MergeTreeExportManifest.h b/src/Storages/MergeTree/MergeTreeExportManifest.h index 0ad93b41798b..bcc480c8507e 100644 --- a/src/Storages/MergeTree/MergeTreeExportManifest.h +++ b/src/Storages/MergeTree/MergeTreeExportManifest.h @@ -10,11 +10,26 @@ struct MergeTreeExportManifest struct CompletionCallbackResult { + private: + CompletionCallbackResult(bool success_, const String & relative_path_in_destination_storage_, const String & exception_) + : success(success_), relative_path_in_destination_storage(relative_path_in_destination_storage_), exception(exception_) {} + public: + + static CompletionCallbackResult createSuccess(const String & relative_path_in_destination_storage_) + { + return CompletionCallbackResult(true, relative_path_in_destination_storage_, ""); + } + + static CompletionCallbackResult createFailure(const String & exception_) + { + return CompletionCallbackResult(false, "", exception_); + } + bool success = false; String relative_path_in_destination_storage; + String exception; }; - MergeTreeExportManifest( const StorageID & destination_storage_id_, const DataPartPtr & data_part_, diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 29f1ae868d34..e9da32e4d37b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1566,7 +1566,7 @@ bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeper /// NOTE /block_numbers/ actually is not flat, because /block_numbers// may have ephemeral children, /// but we assume that all ephemeral block locks are already removed when table is being dropped. 
- static constexpr std::array flat_nodes = {"block_numbers", "blocks", "async_blocks", "leader_election", "log", "mutations", "pinned_part_uuids"}; + static constexpr std::array flat_nodes = {"block_numbers", "blocks", "async_blocks", "leader_election", "log", "mutations", "pinned_part_uuids", "exports_cleanup_lock"}; /// First try to remove paths that are known to be flat for (const auto * node : flat_nodes) @@ -4347,128 +4347,137 @@ void StorageReplicatedMergeTree::mutationsFinalizingTask() void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() { - std::lock_guard lock(export_merge_tree_partition_mutex); - - auto zk = getZooKeeper(); - - const std::string exports_path = fs::path(zookeeper_path) / "exports"; - const std::string cleanup_lock_path = fs::path(zookeeper_path) / "exports_cleanup_lock"; - - bool cleanup_lock_acquired = zk->tryCreate(cleanup_lock_path, "", zkutil::CreateMode::Ephemeral) == Coordination::Error::ZOK; - - if (cleanup_lock_acquired) - { - LOG_INFO(log, "Cleanup lock acquired, will remove stale entries"); - } - - Coordination::Stat stat; - const auto children = zk->getChildrenWatch(exports_path, &stat, export_merge_tree_partition_watch_callback); - const std::unordered_set zk_children(children.begin(), children.end()); - - const auto now = time(nullptr); - - /// Load new entries - /// If we have the cleanup lock, also remove stale entries from zk and local - for (const auto & key : zk_children) + try { - if (!cleanup_lock_acquired && export_merge_tree_partition_task_entries.contains(key)) - continue; - - const std::string entry_path = fs::path(exports_path) / key; - - std::string metadata_json; - if (!zk->tryGet(fs::path(entry_path) / "metadata.json", metadata_json)) - { - LOG_INFO(log, "Skipping {}: missing metadata.json", key); - continue; - } + std::lock_guard lock(export_merge_tree_partition_mutex); - std::string status; - if (!zk->tryGet(fs::path(entry_path) / "status", status)) + auto zk = getZooKeeper(); + + const std::string exports_path = fs::path(zookeeper_path) / "exports"; + const std::string cleanup_lock_path = fs::path(zookeeper_path) / "exports_cleanup_lock"; + + bool cleanup_lock_acquired = zk->tryCreate(cleanup_lock_path, "", zkutil::CreateMode::Ephemeral) == Coordination::Error::ZOK; + + if (cleanup_lock_acquired) { - LOG_INFO(log, "Skipping {}: missing status", key); - continue; + LOG_INFO(log, "Cleanup lock acquired, will remove stale entries"); } - - bool is_not_pending = status != "PENDING"; - - const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); - - if (cleanup_lock_acquired) + + Coordination::Stat stat; + const auto children = zk->getChildrenWatch(exports_path, &stat, export_merge_tree_partition_watch_callback); + const std::unordered_set zk_children(children.begin(), children.end()); + + const auto now = time(nullptr); + + /// Load new entries + /// If we have the cleanup lock, also remove stale entries from zk and local + for (const auto & key : zk_children) { - bool has_expired = metadata.create_time < now - 45; - - if (has_expired && is_not_pending) + if (!cleanup_lock_acquired && export_merge_tree_partition_task_entries.contains(key)) + continue; + + const std::string entry_path = fs::path(exports_path) / key; + + std::string metadata_json; + if (!zk->tryGet(fs::path(entry_path) / "metadata.json", metadata_json)) { - zk->tryRemoveRecursive(fs::path(entry_path)); - export_merge_tree_partition_task_entries.erase(key); - LOG_INFO(log, "Removed {}: expired", key); + LOG_INFO(log, 
"Skipping {}: missing metadata.json", key); continue; } - } - - if (is_not_pending) - { - LOG_INFO(log, "Skipping {}: status is not PENDING", key); - continue; - } - - if (export_merge_tree_partition_task_entries.contains(key)) - { - LOG_INFO(log, "Skipping {}: already exists", key); - continue; - } - - std::string parts_to_do_str; - if (!zk->tryGet(fs::path(entry_path) / "parts_to_do", parts_to_do_str)) - { - LOG_INFO(log, "Skipping {}: no parts_to_do", key); - continue; - } - - uint64_t parts_to_do = 0; - try - { - parts_to_do = std::stoull(parts_to_do_str); - } - catch (...) - { - LOG_WARNING(log, "Skipping {}: invalid parts_to_do='{}'", key, parts_to_do_str); - continue; - } - - if (parts_to_do == 0) - { - LOG_INFO(log, "Skipping {}: parts_to_do is 0", key); - continue; - } - - std::vector part_references; - - for (const auto & part_name : metadata.parts) - { - if (const auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated})) + + std::string status; + if (!zk->tryGet(fs::path(entry_path) / "status", status)) + { + LOG_INFO(log, "Skipping {}: missing status", key); + continue; + } + + bool is_not_pending = status != "PENDING"; + + const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); + + if (cleanup_lock_acquired) + { + bool has_expired = metadata.create_time < now - 45; + + if (has_expired && is_not_pending) + { + zk->tryRemoveRecursive(fs::path(entry_path)); + export_merge_tree_partition_task_entries.erase(key); + LOG_INFO(log, "Removed {}: expired", key); + continue; + } + } + + if (is_not_pending) + { + LOG_INFO(log, "Skipping {}: status is not PENDING", key); + continue; + } + + if (export_merge_tree_partition_task_entries.contains(key)) + { + LOG_INFO(log, "Skipping {}: already exists", key); + continue; + } + + std::string parts_to_do_str; + if (!zk->tryGet(fs::path(entry_path) / "parts_to_do", parts_to_do_str)) + { + LOG_INFO(log, "Skipping {}: no parts_to_do", key); + continue; + } + + uint64_t parts_to_do = 0; + try + { + parts_to_do = std::stoull(parts_to_do_str); + } + catch (...) { - part_references.push_back(part); + LOG_WARNING(log, "Skipping {}: invalid parts_to_do='{}'", key, parts_to_do_str); + continue; } + + if (parts_to_do == 0) + { + LOG_INFO(log, "Skipping {}: parts_to_do is 0", key); + continue; + } + + std::vector part_references; + + for (const auto & part_name : metadata.parts) + { + if (const auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated})) + { + part_references.push_back(part); + } + } + + export_merge_tree_partition_task_entries.emplace( + key, + ExportReplicatedMergeTreePartitionTaskEntry {metadata, parts_to_do, std::move(part_references)}); + } + + /// Remove entries that were deleted by someone else + std::erase_if(export_merge_tree_partition_task_entries, + [&](auto const & kv) { return !zk_children.contains(kv.first); }); + + if (cleanup_lock_acquired) + { + zk->tryRemove(cleanup_lock_path); } - export_merge_tree_partition_task_entries.emplace( - key, - ExportReplicatedMergeTreePartitionTaskEntry {metadata, parts_to_do, std::move(part_references)}); + export_merge_tree_partition_select_task->schedule(); } - - /// Remove entries that were deleted by someone else - std::erase_if(export_merge_tree_partition_task_entries, - [&](auto const & kv) { return !zk_children.contains(kv.first); }); - - if (cleanup_lock_acquired) + catch (...) 
{ - zk->tryRemove(cleanup_lock_path); + tryLogCurrentException(log, __PRETTY_FUNCTION__); } + export_merge_tree_partition_updating_task->scheduleAfter(30 * 1000); - export_merge_tree_partition_select_task->schedule(); } void StorageReplicatedMergeTree::selectPartsToExport() @@ -4594,7 +4603,6 @@ void StorageReplicatedMergeTree::selectPartsToExport() }; auto lock_part = [&]( - const std::string & export_partition_path, const std::string & part_path, const std::string & status_path, const std::string & lock_path, @@ -4620,20 +4628,20 @@ void StorageReplicatedMergeTree::selectPartsToExport() return false; } - std::string parts_to_do_zk; - if (!zk->tryGet(fs::path(export_partition_path) / "parts_to_do", parts_to_do_zk)) - { - LOG_INFO(log, "Failed to get parts_to_do, skipping"); - return false; - } + // std::string parts_to_do_zk; + // if (!zk->tryGet(fs::path(export_partition_path) / "parts_to_do", parts_to_do_zk)) + // { + // LOG_INFO(log, "Failed to get parts_to_do, skipping"); + // return false; + // } - const auto parts_to_do = std::stoull(parts_to_do_zk.c_str()); + // const auto parts_to_do = std::stoull(parts_to_do_zk.c_str()); - if (parts_to_do == 0) - { - LOG_INFO(log, "Skipping... Parts to do is 0, maybe someone else already completed it?"); - return false; - } + // if (parts_to_do == 0) + // { + // LOG_INFO(log, "Skipping... Parts to do is 0, maybe someone else already completed it?"); + // return false; + // } /// only try to lock it if the status is still PENDING /// if we did not check for status = pending, chances are some other replica completed and released the lock in the meantime @@ -4681,7 +4689,7 @@ void StorageReplicatedMergeTree::selectPartsToExport() continue; } - if (lock_part(partition_export_path, part_path, status_path, lock_path, replica_name, zk)) + if (lock_part(part_path, status_path, lock_path, replica_name, zk)) { return manifest.parts[i]; } @@ -4702,7 +4710,7 @@ void StorageReplicatedMergeTree::selectPartsToExport() continue; } - if (lock_part(partition_export_path, part_path, status_path, lock_path, replica_name, zk)) + if (lock_part(part_path, status_path, lock_path, replica_name, zk)) { return manifest.parts[i]; } @@ -4711,160 +4719,207 @@ void StorageReplicatedMergeTree::selectPartsToExport() return std::nullopt; }; - const auto zk = getZooKeeper(); - - std::lock_guard lock(export_merge_tree_partition_mutex); - - for (auto & [key, task_entry] : export_merge_tree_partition_task_entries) - { - /// this sounds impossible, but just in case - if (task_entry.parts_to_do == 0) - { - LOG_INFO(log, "Already completed, skipping"); - continue; - } + try { + const auto zk = getZooKeeper(); - std::string parts_to_do_string; - if (!zk->tryGet(fs::path(exports_path) / key / "parts_to_do", parts_to_do_string)) + std::lock_guard lock(export_merge_tree_partition_mutex); + + for (auto & [key, task_entry] : export_merge_tree_partition_task_entries) { - LOG_INFO(log, "Failed to get parts_to_do, skipping"); - continue; - } + /// this sounds impossible, but just in case + if (task_entry.parts_to_do == 0) + { + LOG_INFO(log, "Already completed, skipping"); + continue; + } + + std::string parts_to_do_string; + if (!zk->tryGet(fs::path(exports_path) / key / "parts_to_do", parts_to_do_string)) + { + LOG_INFO(log, "Failed to get parts_to_do, skipping"); + continue; + } + + const auto parts_to_do = std::stoull(parts_to_do_string.c_str()); + task_entry.parts_to_do = parts_to_do; + + if (task_entry.parts_to_do == 0) + { + LOG_INFO(log, "Already completed, skipping"); + continue; + } + 
+ std::string status; + if (!zk->tryGet(fs::path(exports_path) / key / "status", status)) + { + LOG_INFO(log, "Failed to get status, skipping"); + continue; + } + + if (status != "PENDING") + { + LOG_INFO(log, "Skipping... Status is not PENDING"); + continue; + } + + const auto & manifest = task_entry.manifest; + const auto & database = getContext()->resolveDatabase(manifest.destination_database); + const auto & table = manifest.destination_table; + + const auto destination_storage_id = StorageID(QualifiedTableName {database, table}); + + const auto destination_storage = DatabaseCatalog::instance().tryGetTable(destination_storage_id, getContext()); - const auto parts_to_do = std::stoull(parts_to_do_string.c_str()); - task_entry.parts_to_do = parts_to_do; - - if (task_entry.parts_to_do == 0) - { - LOG_INFO(log, "Already completed, skipping"); - continue; - } - - const auto & manifest = task_entry.manifest; - const auto & database = getContext()->resolveDatabase(manifest.destination_database); - const auto & table = manifest.destination_table; - - const auto destination_storage_id = StorageID(QualifiedTableName {database, table}); - - const auto destination_storage = DatabaseCatalog::instance().tryGetTable(destination_storage_id, getContext()); + if (!destination_storage) + { + LOG_INFO(log, "Failed to reconstruct destination storage: {}, skipping", destination_storage_id.getNameForLogs()); + continue; + } - if (!destination_storage) - { - LOG_INFO(log, "Failed to reconstruct destination storage: {}, skipping", destination_storage_id.getNameForLogs()); - continue; - } - - const auto partition_path = fs::path(exports_path) / key; - const auto next_idx_path = fs::path(partition_path) / "next_idx"; - std::string next_idx_string; - if (!zk->tryGet(next_idx_path, next_idx_string)) - { - LOG_INFO(log, "Failed to get next_idx, skipping"); - continue; - } - - const auto next_idx = std::stoull(next_idx_string.c_str()); - - const auto part_to_export = try_to_acquire_a_part(manifest, partition_path, next_idx, zk); - - const auto partition_id = manifest.partition_id; - const auto transaction_id = manifest.transaction_id; - - if (part_to_export.has_value()) - { - try + const auto partition_path = fs::path(exports_path) / key; + const auto next_idx_path = fs::path(partition_path) / "next_idx"; + std::string next_idx_string; + if (!zk->tryGet(next_idx_path, next_idx_string)) + { + LOG_INFO(log, "Failed to get next_idx, skipping"); + continue; + } + + const auto next_idx = std::stoull(next_idx_string.c_str()); + + const auto part_to_export = try_to_acquire_a_part(manifest, partition_path, next_idx, zk); + + const auto partition_id = manifest.partition_id; + const auto transaction_id = manifest.transaction_id; + + if (part_to_export.has_value()) { - exportPartToTable( - part_to_export.value(), - destination_storage_id, - getContext(), - [this, partition_id, transaction_id, exports_path, key, part_to_export, complete_part_export, partition_path, next_idx_path, next_idx, destination_storage] - (MergeTreeExportManifest::CompletionCallbackResult result) + try { - - const auto zk_client = getZooKeeper(); - if (result.success) + exportPartToTable( + part_to_export.value(), + destination_storage_id, + getContext(), + [this, partition_id, transaction_id, exports_path, key, part_to_export, complete_part_export, partition_path, next_idx_path, next_idx, destination_storage] + (MergeTreeExportManifest::CompletionCallbackResult result) { - - complete_part_export( - partition_path, - fs::path(exports_path) / key / "parts" 
/ part_to_export.value(), - fs::path(exports_path) / key / "parts" / part_to_export.value() / "status", - fs::path(exports_path) / key / "parts_to_do", - fs::path(exports_path) / key / "parts" / part_to_export.value() / "lock", - next_idx_path, - next_idx, - result.relative_path_in_destination_storage, - destination_storage, - transaction_id, - partition_id, - zk_client); - - /// maybe get up to date from complete_parts_export? - std::lock_guard inner_lock(export_merge_tree_partition_mutex); - export_merge_tree_partition_task_entries[key].parts_to_do--; - - if (export_merge_tree_partition_task_entries[key].parts_to_do == 0) - { - export_merge_tree_partition_task_entries.erase(key); + const auto zk_client = getZooKeeper(); + if (result.success) + { + complete_part_export( + partition_path, + fs::path(partition_path) / "parts" / part_to_export.value(), + fs::path(partition_path) / "parts" / part_to_export.value() / "status", + fs::path(partition_path) / "parts_to_do", + fs::path(partition_path) / "parts" / part_to_export.value() / "lock", + next_idx_path, + next_idx, + result.relative_path_in_destination_storage, + destination_storage, + transaction_id, + partition_id, + zk_client); + + /// maybe get up to date from complete_parts_export? + std::lock_guard inner_lock(export_merge_tree_partition_mutex); + export_merge_tree_partition_task_entries[key].parts_to_do--; + + if (export_merge_tree_partition_task_entries[key].parts_to_do == 0) + { + export_merge_tree_partition_task_entries.erase(key); + } } - } - else - { - /// increment retry_count - /// if above threshhold, fail the entire export - hopefully it is safe to do so :D - /// I could also leave this for the cleanup thread, but will do it here for now. - - Coordination::Requests ops; - - std::string retry_count_string; - if (zk_client->tryGet(fs::path(exports_path) / key / "parts" / part_to_export.value() / "retry_count", retry_count_string)) + else { - std::size_t retry_count = std::stoull(retry_count_string.c_str()) + 1; - - //// todo arthur unhardcode this - if (retry_count > 3) + /// increment retry_count + /// if above threshhold, fail the entire export - hopefully it is safe to do so :D + /// I could also leave this for the cleanup thread, but will do it here for now. 
+ + Coordination::Requests ops; + + std::string retry_count_string; + if (zk_client->tryGet(fs::path(partition_path) / "parts" / part_to_export.value() / "retry_count", retry_count_string)) { - ops.emplace_back(zkutil::makeRemoveRecursiveRequest(partition_path, 1000)); + std::size_t retry_count = std::stoull(retry_count_string.c_str()) + 1; + + //// todo arthur unhardcode this + if (retry_count >= 3) + { + /// instead of removing the entire partition, just mark the status as failed + ops.emplace_back(zkutil::makeSetRequest(fs::path(partition_path) / "status", "FAILED", -1)); + /// mark the part as failed as well + ops.emplace_back(zkutil::makeSetRequest(fs::path(partition_path) / "parts" / part_to_export.value() / "status", "FAILED", -1)); + } + else + { + ops.emplace_back(zkutil::makeSetRequest(fs::path(partition_path) / "parts" / part_to_export.value() / "retry_count", std::to_string(retry_count), -1)); + } + + ops.emplace_back(zkutil::makeRemoveRequest(fs::path(partition_path) / "parts" / part_to_export.value() / "lock", -1)); } else { - ops.emplace_back(zkutil::makeSetRequest(fs::path(exports_path) / key / "parts" / part_to_export.value() / "retry_count", std::to_string(retry_count), -1)); - /// unlock the part - ops.emplace_back(zkutil::makeRemoveRequest(fs::path(exports_path) / key / "parts" / part_to_export.value() / "lock", -1)); + LOG_INFO(log, "Failed to get retry_count, will not try to update it"); + ops.emplace_back(zkutil::makeRemoveRequest(fs::path(partition_path) / "parts" / part_to_export.value() / "lock", -1)); } + + std::size_t num_exceptions = 0; + + const auto exceptions_per_replica_path = fs::path(partition_path) / "exceptions_per_replica" / replica_name; + const auto count_path = exceptions_per_replica_path / "count"; + const auto last_exception_path = exceptions_per_replica_path / "last_exception"; + + if (zk_client->exists(exceptions_per_replica_path)) + { + std::string num_exceptions_string; + zk_client->tryGet(count_path, num_exceptions_string); + num_exceptions = std::stoull(num_exceptions_string.c_str()); + + ops.emplace_back(zkutil::makeSetRequest(last_exception_path / "part", part_to_export.value(), -1)); + ops.emplace_back(zkutil::makeSetRequest(last_exception_path / "exception", result.exception, -1)); + } + else + { + ops.emplace_back(zkutil::makeCreateRequest(exceptions_per_replica_path, "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(count_path, "0", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(last_exception_path, "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(last_exception_path / "part", part_to_export.value(), zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(last_exception_path / "exception", result.exception, zkutil::CreateMode::Persistent)); + } + + num_exceptions++; + ops.emplace_back(zkutil::makeSetRequest(count_path, std::to_string(num_exceptions), -1)); + + Coordination::Responses responses; + if (zk_client->tryMulti(ops, responses) != Coordination::Error::ZOK) + { + LOG_INFO(log, "All failure mechanism failed, will not try to update it"); + return; + } + } - else - { - LOG_INFO(log, "Failed to get retry_count, will not try to update it"); - ops.emplace_back(zkutil::makeRemoveRequest(fs::path(partition_path) / "parts" / part_to_export.value() / "lock", -1)); - } - - Coordination::Responses responses; - if (zk_client->tryMulti(ops, responses) != Coordination::Error::ZOK) - { - LOG_INFO(log, "All failure mechanism 
failed, will not try to update it"); - return; - } - - } - }); - } - catch (...) - { - /// failed to schedule the part export - tryLogCurrentException(log, __PRETTY_FUNCTION__); - - /// best-effort to remove the lock (actually, we should make sure the lock is released..) - zk->tryRemove(fs::path(partition_path) / "parts" / part_to_export.value() / "lock"); - - /// re-run after some time - export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); + }); + } + catch (...) + { + /// failed to schedule the part export + tryLogCurrentException(log, __PRETTY_FUNCTION__); + + /// best-effort to remove the lock (actually, we should make sure the lock is released..) + zk->tryRemove(fs::path(partition_path) / "parts" / part_to_export.value() / "lock"); + + /// re-run after some time + export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); + } + } - } } - + catch (...) + { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); + } // export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); } From 690a74493ff013cfa2e260fdf08495c9eba709fe Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 14 Oct 2025 17:52:07 -0300 Subject: [PATCH 10/17] a few fixes --- src/Storages/MergeTree/MergeTreeData.cpp | 7 ------- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- .../02221_system_zookeeper_unrestricted_like.reference | 1 + .../0_stateless/03604_export_merge_tree_partition.sh | 3 +++ 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index e2ae830544fc..52fdcddf0145 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6101,13 +6101,6 @@ void MergeTreeData::exportPartToTableImpl( CompletedPipelineExecutor exec(pipeline); exec.execute(); - volatile bool x = true; - - if (x) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Some issue"); - } - std::lock_guard inner_lock(export_manifests_mutex); writePartLog( PartLogElement::Type::EXPORT_PART, diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index e9da32e4d37b..163987ba8814 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1566,7 +1566,7 @@ bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeper /// NOTE /block_numbers/ actually is not flat, because /block_numbers// may have ephemeral children, /// but we assume that all ephemeral block locks are already removed when table is being dropped. 
- static constexpr std::array flat_nodes = {"block_numbers", "blocks", "async_blocks", "leader_election", "log", "mutations", "pinned_part_uuids", "exports_cleanup_lock"}; + static constexpr std::array flat_nodes = {"block_numbers", "blocks", "async_blocks", "leader_election", "log", "mutations", "pinned_part_uuids"}; /// First try to remove paths that are known to be flat for (const auto * node : flat_nodes) diff --git a/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference b/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference index fe4cbcad7b5d..7648b3161893 100644 --- a/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference +++ b/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference @@ -8,6 +8,7 @@ blocks columns columns creator_info +exports failed_parts flags host diff --git a/tests/queries/0_stateless/03604_export_merge_tree_partition.sh b/tests/queries/0_stateless/03604_export_merge_tree_partition.sh index 1198281afc72..ea77a2e3f6ff 100755 --- a/tests/queries/0_stateless/03604_export_merge_tree_partition.sh +++ b/tests/queries/0_stateless/03604_export_merge_tree_partition.sh @@ -24,6 +24,9 @@ query "INSERT INTO $rmt_table VALUES (3, 2020), (5, 2021)" query "INSERT INTO $rmt_table VALUES (6, 2022), (7, 2022)" +# sync replicas +query "SYSTEM SYNC REPLICA $rmt_table"s + query "ALTER TABLE $rmt_table EXPORT PARTITION ID '2020' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" query "ALTER TABLE $rmt_table EXPORT PARTITION ID '2021' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" From dc07bd9348877954b9c7aa11d3d0e71c6ea31b94 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 15 Oct 2025 11:31:24 -0300 Subject: [PATCH 11/17] make dest storage id part of the key --- src/Storages/StorageReplicatedMergeTree.cpp | 10 +++++++--- .../0_stateless/03604_export_merge_tree_partition.sh | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 163987ba8814..f2b4d93401ad 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4398,7 +4398,7 @@ void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() if (cleanup_lock_acquired) { - bool has_expired = metadata.create_time < now - 45; + bool has_expired = metadata.create_time < now - 90; if (has_expired && is_not_pending) { @@ -4719,7 +4719,8 @@ void StorageReplicatedMergeTree::selectPartsToExport() return std::nullopt; }; - try { + try + { const auto zk = getZooKeeper(); std::lock_guard lock(export_merge_tree_partition_mutex); @@ -8534,6 +8535,7 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & const auto dest_database = query_context->resolveDatabase(command.to_database); const auto dest_table = command.to_table; + const auto dest_storage_id = StorageID(dest_database, dest_table); auto dest_storage = DatabaseCatalog::instance().getTable({dest_database, dest_table}, query_context); if (dest_storage->getStorageID() == this->getStorageID()) @@ -8597,7 +8599,9 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & ops.emplace_back(zkutil::makeCreateRequest(exports_path, "", zkutil::CreateMode::Persistent)); } - const auto partition_exports_path = exports_path / partition_id; + const auto export_key = partition_id + "_" + dest_storage_id.getNameForLogs(); + + const auto 
partition_exports_path = fs::path(exports_path) / export_key; /// check if entry already exists if (zookeeper->exists(partition_exports_path)) diff --git a/tests/queries/0_stateless/03604_export_merge_tree_partition.sh b/tests/queries/0_stateless/03604_export_merge_tree_partition.sh index ea77a2e3f6ff..d27126469603 100755 --- a/tests/queries/0_stateless/03604_export_merge_tree_partition.sh +++ b/tests/queries/0_stateless/03604_export_merge_tree_partition.sh @@ -25,7 +25,7 @@ query "INSERT INTO $rmt_table VALUES (3, 2020), (5, 2021)" query "INSERT INTO $rmt_table VALUES (6, 2022), (7, 2022)" # sync replicas -query "SYSTEM SYNC REPLICA $rmt_table"s +query "SYSTEM SYNC REPLICA $rmt_table" query "ALTER TABLE $rmt_table EXPORT PARTITION ID '2020' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" From dd622c829b5b3c1a70acda35a4cdf4dc0557ed38 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 16 Oct 2025 11:48:59 -0300 Subject: [PATCH 12/17] add system.replicated_partition_exports --- src/Storages/StorageReplicatedMergeTree.cpp | 55 +++++++ src/Storages/StorageReplicatedMergeTree.h | 4 + ...torageSystemReplicatedPartitionExports.cpp | 135 ++++++++++++++++++ .../StorageSystemReplicatedPartitionExports.h | 38 +++++ src/Storages/System/attachSystemTables.cpp | 2 + 5 files changed, 234 insertions(+) create mode 100644 src/Storages/System/StorageSystemReplicatedPartitionExports.cpp create mode 100644 src/Storages/System/StorageSystemReplicatedPartitionExports.h diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index f2b4d93401ad..3a484454ddbd 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -4924,6 +4925,60 @@ void StorageReplicatedMergeTree::selectPartsToExport() // export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); } +std::vector StorageReplicatedMergeTree::getPartitionExportsInfo() const +{ + std::vector infos; + + const auto zk = getZooKeeper(); + const auto exports_path = fs::path(zookeeper_path) / "exports"; + const auto children = zk->getChildren(exports_path); + + for (const auto & child : children) + { + ReplicatedPartitionExportInfo info; + + const auto export_partition_path = fs::path(exports_path) / child; + std::string metadata_json; + if (!zk->tryGet(export_partition_path / "metadata.json", metadata_json)) + { + LOG_INFO(log, "Skipping {}: missing metadata.json", child); + continue; + } + + std::string parts_to_do_string; + if (!zk->tryGet(export_partition_path / "parts_to_do", parts_to_do_string)) + { + LOG_INFO(log, "Skipping {}: missing parts_to_do", child); + continue; + } + + std::string status; + if (!zk->tryGet(export_partition_path / "status", status)) + { + LOG_INFO(log, "Skipping {}: missing status", child); + continue; + } + + const auto parts_to_do = std::stoull(parts_to_do_string.c_str()); + const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); + + info.destination_database = metadata.destination_database; + info.destination_table = metadata.destination_table; + info.partition_id = metadata.partition_id; + info.transaction_id = metadata.transaction_id; + info.create_time = metadata.create_time; + info.source_replica = metadata.source_replica; + info.parts_count = metadata.number_of_parts; + info.parts_to_do = parts_to_do; + info.parts = metadata.parts; + info.status = status; + + 
infos.emplace_back(std::move(info)); + } + + return infos; +} + StorageReplicatedMergeTree::CreateMergeEntryResult StorageReplicatedMergeTree::createLogEntryToMergeParts( zkutil::ZooKeeperPtr & zookeeper, diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 715f32ea28dd..c519f1feddb3 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -98,6 +98,8 @@ namespace DB class ZooKeeperWithFaultInjection; using ZooKeeperWithFaultInjectionPtr = std::shared_ptr; +struct ReplicatedPartitionExportInfo; + class StorageReplicatedMergeTree final : public MergeTreeData { public: @@ -369,6 +371,8 @@ class StorageReplicatedMergeTree final : public MergeTreeData using ShutdownDeadline = std::chrono::time_point; void waitForUniquePartsToBeFetchedByOtherReplicas(ShutdownDeadline shutdown_deadline); + std::vector getPartitionExportsInfo() const; + private: std::atomic_bool are_restoring_replica {false}; diff --git a/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp b/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp new file mode 100644 index 000000000000..e207755f7fff --- /dev/null +++ b/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp @@ -0,0 +1,135 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Columns/ColumnString.h" +#include "Storages/VirtualColumnUtils.h" + + +namespace DB +{ + +ColumnsDescription StorageSystemReplicatedPartitionExports::getColumnsDescription() +{ + return ColumnsDescription + { + {"source_database", std::make_shared(), "Name of the source database."}, + {"source_table", std::make_shared(), "Name of the source table."}, + {"destination_database", std::make_shared(), "Name of the destination database."}, + {"destination_table", std::make_shared(), "Name of the destination table."}, + {"create_time", std::make_shared(), "Date and time when the export command was submitted"}, + {"partition_id", std::make_shared(), "ID of the partition"}, + {"transaction_id", std::make_shared(), "ID of the transaction."}, + {"source_replica", std::make_shared(), "Name of the source replica."}, + {"parts", std::make_shared(std::make_shared()), "List of part names to be exported."}, + {"parts_count", std::make_shared(), "Number of parts in the export."}, + {"parts_to_do", std::make_shared(), "Number of parts pending to be exported."}, + {"status", std::make_shared(), "Status of the export."}, + }; +} + +void StorageSystemReplicatedPartitionExports::fillData(MutableColumns & res_columns, ContextPtr context, const ActionsDAG::Node * predicate, std::vector) const +{ + const auto access = context->getAccess(); + const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES); + + std::map> replicated_merge_tree_tables; + for (const auto & db : DatabaseCatalog::instance().getDatabases()) + { + /// Check if database can contain MergeTree tables + if (!db.second->canContainMergeTreeTables()) + continue; + + const bool check_access_for_tables = check_access_for_databases && !access->isGranted(AccessType::SHOW_TABLES, db.first); + + for (auto iterator = db.second->getTablesIterator(context); iterator->isValid(); iterator->next()) + { + const auto & table = iterator->table(); + if (!table) + continue; + + StorageReplicatedMergeTree * table_replicated = dynamic_cast(table.get()); + if (!table_replicated) + continue; + + if (check_access_for_tables && 
!access->isGranted(AccessType::SHOW_TABLES, db.first, iterator->name())) + continue; + + replicated_merge_tree_tables[db.first][iterator->name()] = table; + } + } + + MutableColumnPtr col_database_mut = ColumnString::create(); + MutableColumnPtr col_table_mut = ColumnString::create(); + + for (auto & db : replicated_merge_tree_tables) + { + for (auto & table : db.second) + { + col_database_mut->insert(db.first); + col_table_mut->insert(table.first); + } + } + + ColumnPtr col_database = std::move(col_database_mut); + ColumnPtr col_table = std::move(col_table_mut); + + /// Determine what tables are needed by the conditions in the query. + { + Block filtered_block + { + { col_database, std::make_shared(), "database" }, + { col_table, std::make_shared(), "table" }, + }; + + VirtualColumnUtils::filterBlockWithPredicate(predicate, filtered_block, context); + + if (!filtered_block.rows()) + return; + + col_database = filtered_block.getByName("database").column; + col_table = filtered_block.getByName("table").column; + } + + for (size_t i_storage = 0; i_storage < col_database->size(); ++i_storage) + { + const auto database = (*col_database)[i_storage].safeGet(); + const auto table = (*col_table)[i_storage].safeGet(); + + std::vector partition_exports_info; + { + const IStorage * storage = replicated_merge_tree_tables[database][table].get(); + if (const auto * replicated_merge_tree = dynamic_cast(storage)) + partition_exports_info = replicated_merge_tree->getPartitionExportsInfo(); + } + + for (const ReplicatedPartitionExportInfo & info : partition_exports_info) + { + std::size_t i = 0; + res_columns[i++]->insert(database); + res_columns[i++]->insert(table); + res_columns[i++]->insert(info.destination_database); + res_columns[i++]->insert(info.destination_table); + res_columns[i++]->insert(info.create_time); + res_columns[i++]->insert(info.partition_id); + res_columns[i++]->insert(info.transaction_id); + res_columns[i++]->insert(info.source_replica); + Array parts_array; + parts_array.reserve(info.parts.size()); + for (const auto & part : info.parts) + parts_array.push_back(part); + res_columns[i++]->insert(parts_array); + res_columns[i++]->insert(info.parts_count); + res_columns[i++]->insert(info.parts_to_do); + res_columns[i++]->insert(info.status); + } + } +} + +} diff --git a/src/Storages/System/StorageSystemReplicatedPartitionExports.h b/src/Storages/System/StorageSystemReplicatedPartitionExports.h new file mode 100644 index 000000000000..23e2471a92ff --- /dev/null +++ b/src/Storages/System/StorageSystemReplicatedPartitionExports.h @@ -0,0 +1,38 @@ +#pragma once + +#include + +namespace DB +{ + +class Context; + +struct ReplicatedPartitionExportInfo +{ + String destination_database; + String destination_table; + String partition_id; + String transaction_id; + time_t create_time; + String source_replica; + size_t parts_count; + size_t parts_to_do; + std::vector parts; + String status; +}; + +class StorageSystemReplicatedPartitionExports final : public IStorageSystemOneBlock +{ +public: + + std::string getName() const override { return "SystemReplicatedPartitionExports"; } + + static ColumnsDescription getColumnsDescription(); + +protected: + using IStorageSystemOneBlock::IStorageSystemOneBlock; + + void fillData(MutableColumns & res_columns, ContextPtr context, const ActionsDAG::Node *, std::vector) const override; +}; + +} diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index a0f707b70a90..4f74c7f11cd1 100644 --- 
a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -1,4 +1,5 @@ #include "Storages/System/StorageSystemKeywords.h" +#include #include "config.h" #include @@ -209,6 +210,7 @@ void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, b attach(context, system_database, "merges", "Contains a list of merges currently executing merges of MergeTree tables and their progress. Each merge operation is represented by a single row."); attach(context, system_database, "moves", "Contains information about in-progress data part moves of MergeTree tables. Each data part movement is represented by a single row."); attach(context, system_database, "exports", "Contains a list of exports currently executing exports of MergeTree tables and their progress. Each export operation is represented by a single row."); + attach(context, system_database, "replicated_partition_exports", "Contains a list of partition exports of ReplicatedMergeTree tables and their progress. Each export operation is represented by a single row."); attach(context, system_database, "mutations", "Contains a list of mutations and their progress. Each mutation command is represented by a single row."); attachNoDescription(context, system_database, "replicas", "Contains information and status of all table replicas on current server. Each replica is represented by a single row."); attach(context, system_database, "replication_queue", "Contains information about tasks from replication queues stored in ClickHouse Keeper, or ZooKeeper, for each table replica."); From e06d44b69b05be0add3b436d6e415612f6bc8250 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 17 Oct 2025 08:53:18 -0300 Subject: [PATCH 13/17] add exception to replicated systems table --- src/Storages/StorageReplicatedMergeTree.cpp | 35 +++++++++++++++++++ ...torageSystemReplicatedPartitionExports.cpp | 6 ++++ .../StorageSystemReplicatedPartitionExports.h | 3 ++ ...3604_export_merge_tree_partition.reference | 1 - 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 3a484454ddbd..63040d835bb3 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4960,6 +4960,38 @@ std::vector StorageReplicatedMergeTree::getPartit } const auto parts_to_do = std::stoull(parts_to_do_string.c_str()); + + std::string last_exception; + std::string exception_part; + std::size_t exception_count = 0; + + const auto exceptions_per_replica_path = export_partition_path / "exceptions_per_replica"; + + const auto exception_children = zk->getChildren(exceptions_per_replica_path); + for (const auto & exception_child : exception_children) + { + std::string exception_count_string; + if (!zk->tryGet(exceptions_per_replica_path / exception_child / "count", exception_count_string)) + { + LOG_INFO(log, "Skipping {}: missing count", exception_child); + continue; + } + exception_count += std::stoull(exception_count_string.c_str()); + + if (last_exception.empty()) + { + const auto last_exception_path = exceptions_per_replica_path / exception_child / "last_exception"; + std::string last_exception_string; + if (!zk->tryGet(last_exception_path / "exception", last_exception_string)) + { + LOG_INFO(log, "Skipping {}: missing last_exception/exception", last_exception_path); + continue; + } + last_exception = last_exception_string; + exception_part = last_exception_path / "part"; + } + } + const auto 
metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); info.destination_database = metadata.destination_database; @@ -4972,6 +5004,9 @@ std::vector StorageReplicatedMergeTree::getPartit info.parts_to_do = parts_to_do; info.parts = metadata.parts; info.status = status; + info.last_exception = last_exception; + info.exception_part = exception_part; + info.exception_count = exception_count; infos.emplace_back(std::move(info)); } diff --git a/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp b/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp index e207755f7fff..94dabe8bf36a 100644 --- a/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp +++ b/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp @@ -31,6 +31,9 @@ ColumnsDescription StorageSystemReplicatedPartitionExports::getColumnsDescriptio {"parts_count", std::make_shared(), "Number of parts in the export."}, {"parts_to_do", std::make_shared(), "Number of parts pending to be exported."}, {"status", std::make_shared(), "Status of the export."}, + {"last_exception", std::make_shared(), "Last exception message of any part (not necessarily the last global exception)"}, + {"exception_part", std::make_shared(), "Part that caused the last exception"}, + {"exception_count", std::make_shared(), "Number of global exceptions"}, }; } @@ -128,6 +131,9 @@ void StorageSystemReplicatedPartitionExports::fillData(MutableColumns & res_colu res_columns[i++]->insert(info.parts_count); res_columns[i++]->insert(info.parts_to_do); res_columns[i++]->insert(info.status); + res_columns[i++]->insert(info.last_exception); + res_columns[i++]->insert(info.exception_part); + res_columns[i++]->insert(info.exception_count); } } } diff --git a/src/Storages/System/StorageSystemReplicatedPartitionExports.h b/src/Storages/System/StorageSystemReplicatedPartitionExports.h index 23e2471a92ff..de9b6bc1d7ff 100644 --- a/src/Storages/System/StorageSystemReplicatedPartitionExports.h +++ b/src/Storages/System/StorageSystemReplicatedPartitionExports.h @@ -19,6 +19,9 @@ struct ReplicatedPartitionExportInfo size_t parts_to_do; std::vector parts; String status; + std::string last_exception; + std::string exception_part; + size_t exception_count; }; class StorageSystemReplicatedPartitionExports final : public IStorageSystemOneBlock diff --git a/tests/queries/0_stateless/03604_export_merge_tree_partition.reference b/tests/queries/0_stateless/03604_export_merge_tree_partition.reference index 97e17ae5aeec..2bda7b6165a4 100644 --- a/tests/queries/0_stateless/03604_export_merge_tree_partition.reference +++ b/tests/queries/0_stateless/03604_export_merge_tree_partition.reference @@ -1,4 +1,3 @@ - Select from source table 1 2020 2 2020 From 94bcf3e92f428a256e8a72ea1df2bf4d2f04645d Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 17 Oct 2025 09:12:35 -0300 Subject: [PATCH 14/17] add the replica that caused the exception --- src/Storages/StorageReplicatedMergeTree.cpp | 23 ++++++++++++++----- ...torageSystemReplicatedPartitionExports.cpp | 2 ++ .../StorageSystemReplicatedPartitionExports.h | 1 + 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 63040d835bb3..34a32dcbfc1f 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4961,34 +4961,44 @@ std::vector StorageReplicatedMergeTree::getPartit const auto parts_to_do = 
std::stoull(parts_to_do_string.c_str()); + std::string exception_replica; std::string last_exception; std::string exception_part; std::size_t exception_count = 0; const auto exceptions_per_replica_path = export_partition_path / "exceptions_per_replica"; - const auto exception_children = zk->getChildren(exceptions_per_replica_path); - for (const auto & exception_child : exception_children) + const auto exception_replicas = zk->getChildren(exceptions_per_replica_path); + for (const auto & replica : exception_replicas) { std::string exception_count_string; - if (!zk->tryGet(exceptions_per_replica_path / exception_child / "count", exception_count_string)) + if (!zk->tryGet(exceptions_per_replica_path / replica / "count", exception_count_string)) { - LOG_INFO(log, "Skipping {}: missing count", exception_child); + LOG_INFO(log, "Skipping {}: missing count", replica); continue; } exception_count += std::stoull(exception_count_string.c_str()); if (last_exception.empty()) { - const auto last_exception_path = exceptions_per_replica_path / exception_child / "last_exception"; + const auto last_exception_path = exceptions_per_replica_path / replica / "last_exception"; std::string last_exception_string; if (!zk->tryGet(last_exception_path / "exception", last_exception_string)) { LOG_INFO(log, "Skipping {}: missing last_exception/exception", last_exception_path); continue; } + + std::string exception_part_zk; + if (!zk->tryGet(last_exception_path / "part", exception_part_zk)) + { + LOG_INFO(log, "Skipping {}: missing exception part", last_exception_path); + continue; + } + + exception_replica = replica; last_exception = last_exception_string; - exception_part = last_exception_path / "part"; + exception_part = exception_part_zk; } } @@ -5004,6 +5014,7 @@ std::vector StorageReplicatedMergeTree::getPartit info.parts_to_do = parts_to_do; info.parts = metadata.parts; info.status = status; + info.exception_replica = exception_replica; info.last_exception = last_exception; info.exception_part = exception_part; info.exception_count = exception_count; diff --git a/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp b/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp index 94dabe8bf36a..018f0c8ffac7 100644 --- a/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp +++ b/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp @@ -31,6 +31,7 @@ ColumnsDescription StorageSystemReplicatedPartitionExports::getColumnsDescriptio {"parts_count", std::make_shared(), "Number of parts in the export."}, {"parts_to_do", std::make_shared(), "Number of parts pending to be exported."}, {"status", std::make_shared(), "Status of the export."}, + {"exception_replica", std::make_shared(), "Replica that caused the last exception"}, {"last_exception", std::make_shared(), "Last exception message of any part (not necessarily the last global exception)"}, {"exception_part", std::make_shared(), "Part that caused the last exception"}, {"exception_count", std::make_shared(), "Number of global exceptions"}, @@ -131,6 +132,7 @@ void StorageSystemReplicatedPartitionExports::fillData(MutableColumns & res_colu res_columns[i++]->insert(info.parts_count); res_columns[i++]->insert(info.parts_to_do); res_columns[i++]->insert(info.status); + res_columns[i++]->insert(info.exception_replica); res_columns[i++]->insert(info.last_exception); res_columns[i++]->insert(info.exception_part); res_columns[i++]->insert(info.exception_count); diff --git a/src/Storages/System/StorageSystemReplicatedPartitionExports.h 
b/src/Storages/System/StorageSystemReplicatedPartitionExports.h index de9b6bc1d7ff..de2547437c21 100644 --- a/src/Storages/System/StorageSystemReplicatedPartitionExports.h +++ b/src/Storages/System/StorageSystemReplicatedPartitionExports.h @@ -19,6 +19,7 @@ struct ReplicatedPartitionExportInfo size_t parts_to_do; std::vector parts; String status; + std::string exception_replica; std::string last_exception; std::string exception_part; size_t exception_count; From 8948a900455de02e8c7cefc444f49f3fe8e9e310 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 17 Oct 2025 09:57:01 -0300 Subject: [PATCH 15/17] export_merge_tree_partition_force_export --- src/Common/ZooKeeper/ZooKeeper.h | 2 +- src/Core/Settings.cpp | 3 ++ src/Storages/StorageReplicatedMergeTree.cpp | 40 ++++++++++++--------- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 5154d734f1be..657a60e1a6fa 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -378,8 +378,8 @@ class ZooKeeper Coordination::ListRequestType list_request_type = Coordination::ListRequestType::ALL); using MultiGetChildrenResponse = MultiReadResponses; + using MultiTryGetChildrenResponse = MultiReadResponses; - template MultiGetChildrenResponse getChildren(TIter start, TIter end, Coordination::ListRequestType list_request_type = Coordination::ListRequestType::ALL) diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 023c7951fc91..4238e0011c06 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -6706,6 +6706,9 @@ Possible values: )", 0) \ DECLARE(Bool, export_merge_tree_part_overwrite_file_if_exists, false, R"( Overwrite file if it already exists when exporting a merge tree part +)", 0) \ + DECLARE(Bool, export_merge_tree_partition_force_export, false, R"( +Ignore existing partition export and overwrite the zookeeper entry )", 0) \ \ /* ####################################################### */ \ diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 34a32dcbfc1f..765ace20379d 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -188,6 +188,7 @@ namespace Setting extern const SettingsInt64 replication_wait_for_inactive_replica_timeout; extern const SettingsUInt64 select_sequential_consistency; extern const SettingsBool allow_experimental_export_merge_tree_part; + extern const SettingsBool export_merge_tree_partition_force_export; } namespace MergeTreeSetting @@ -4374,9 +4375,6 @@ void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() /// If we have the cleanup lock, also remove stale entries from zk and local for (const auto & key : zk_children) { - if (!cleanup_lock_acquired && export_merge_tree_partition_task_entries.contains(key)) - continue; - const std::string entry_path = fs::path(exports_path) / key; std::string metadata_json; @@ -4385,6 +4383,19 @@ void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() LOG_INFO(log, "Skipping {}: missing metadata.json", key); continue; } + + const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); + + const auto local_entry = export_merge_tree_partition_task_entries.find(key); + + /// If the zk entry has been replaced with export_merge_tree_partition_force_export, checking only for the export key is not enough + /// we need to make sure it is the same transaction id. 
If it is not, it needs to be replaced. + bool has_local_entry_and_is_up_to_date = local_entry != export_merge_tree_partition_task_entries.end() + && local_entry->second.manifest.transaction_id == metadata.transaction_id; + + /// If the entry is up to date and we don't have the cleanup lock, early exit, nothing to be done. + if (!cleanup_lock_acquired && has_local_entry_and_is_up_to_date) + continue; std::string status; if (!zk->tryGet(fs::path(entry_path) / "status", status)) @@ -4395,8 +4406,6 @@ void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() bool is_not_pending = status != "PENDING"; - const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); - if (cleanup_lock_acquired) { bool has_expired = metadata.create_time < now - 90; @@ -4416,7 +4425,7 @@ void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() continue; } - if (export_merge_tree_partition_task_entries.contains(key)) + if (has_local_entry_and_is_up_to_date) { LOG_INFO(log, "Skipping {}: already exists", key); continue; @@ -4456,9 +4465,8 @@ void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() } } - export_merge_tree_partition_task_entries.emplace( - key, - ExportReplicatedMergeTreePartitionTaskEntry {metadata, parts_to_do, std::move(part_references)}); + /// It is important to use the operator[] because it updates the existing entry if it already exists. + export_merge_tree_partition_task_entries[key] = ExportReplicatedMergeTreePartitionTaskEntry {metadata, parts_to_do, std::move(part_references)}; } /// Remove entries that were deleted by someone else @@ -8694,12 +8702,6 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & start_time: 123445 */ - /// maybe this should go in initialization somewhere else - if (!zookeeper->exists(exports_path)) - { - ops.emplace_back(zkutil::makeCreateRequest(exports_path, "", zkutil::CreateMode::Persistent)); - } - const auto export_key = partition_id + "_" + dest_storage_id.getNameForLogs(); const auto partition_exports_path = fs::path(exports_path) / export_key; @@ -8707,7 +8709,13 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & /// check if entry already exists if (zookeeper->exists(partition_exports_path)) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition {} already exported or it is being exported", partition_id); + if (!query_context->getSettingsRef()[Setting::export_merge_tree_partition_force_export]) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition {} already exported or it is being exported", partition_id); + } + + /// The check for existence and entry removal are not atomic, so this actually might fail. 
+ ops.emplace_back(zkutil::makeRemoveRecursiveRequest(partition_exports_path, -1)); } ops.emplace_back(zkutil::makeCreateRequest(partition_exports_path, "", zkutil::CreateMode::Persistent)); From 8c02b951ced7a4a7ae1f479add4b922631a47101 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 21 Oct 2025 09:33:23 -0300 Subject: [PATCH 16/17] almost done with kill export partition --- .../InterpreterKillQueryQuery.cpp | 74 +++++++++++++++++ src/Parsers/ASTKillQueryQuery.cpp | 3 + src/Parsers/ASTKillQueryQuery.h | 1 + src/Parsers/ParserKillQueryQuery.cpp | 3 + src/Storages/IStorage.cpp | 5 ++ src/Storages/IStorage.h | 3 + src/Storages/MergeTree/MergeTreeData.cpp | 41 +++++++-- src/Storages/MergeTree/MergeTreeData.h | 16 ++-- ...nifest.h => MergeTreePartExportManifest.h} | 15 +++- ...rtStatus.h => MergeTreePartExportStatus.h} | 0 src/Storages/StorageReplicatedMergeTree.cpp | 83 +++++++++++++++---- src/Storages/StorageReplicatedMergeTree.h | 2 + 12 files changed, 212 insertions(+), 34 deletions(-) rename src/Storages/MergeTree/{MergeTreeExportManifest.h => MergeTreePartExportManifest.h} (84%) rename src/Storages/MergeTree/{MergeTreeExportStatus.h => MergeTreePartExportStatus.h} (100%) diff --git a/src/Interpreters/InterpreterKillQueryQuery.cpp b/src/Interpreters/InterpreterKillQueryQuery.cpp index 94563b55f74e..405c33cac68a 100644 --- a/src/Interpreters/InterpreterKillQueryQuery.cpp +++ b/src/Interpreters/InterpreterKillQueryQuery.cpp @@ -250,6 +250,77 @@ BlockIO InterpreterKillQueryQuery::execute() break; } + case ASTKillQueryQuery::Type::ExportPartition: + { + Block exports_block = getSelectResult( + "source_database, source_table, transaction_id, destination_database, destination_table, partition_id", + "system.replicated_partition_exports"); + if (!exports_block) + return res_io; + + const ColumnString & src_db_col = typeid_cast(*exports_block.getByName("source_database").column); + const ColumnString & src_tbl_col = typeid_cast(*exports_block.getByName("source_table").column); + const ColumnString & dst_db_col = typeid_cast(*exports_block.getByName("destination_database").column); + const ColumnString & dst_tbl_col = typeid_cast(*exports_block.getByName("destination_table").column); + const ColumnString & tx_col = typeid_cast(*exports_block.getByName("transaction_id").column); + + auto header = exports_block.cloneEmpty(); + header.insert(0, {ColumnString::create(), std::make_shared(), "kill_status"}); + + MutableColumns res_columns = header.cloneEmptyColumns(); + auto table_id = StorageID::createEmpty(); + AccessRightsElements required_access_rights; + auto access = getContext()->getAccess(); + bool access_denied = false; + + for (size_t i = 0; i < exports_block.rows(); ++i) + { + const auto src_database = src_db_col.getDataAt(i).toString(); + const auto src_table = src_tbl_col.getDataAt(i).toString(); + const auto dst_database = dst_db_col.getDataAt(i).toString(); + const auto dst_table = dst_tbl_col.getDataAt(i).toString(); + + table_id = StorageID{src_database, src_table}; + auto transaction_id = tx_col.getDataAt(i).toString(); + + CancellationCode code = CancellationCode::Unknown; + if (!query.test) + { + auto storage = DatabaseCatalog::instance().tryGetTable(table_id, getContext()); + if (!storage) + code = CancellationCode::NotFound; + else + { + ASTAlterCommand alter_command{}; + alter_command.type = ASTAlterCommand::EXPORT_PARTITION; + alter_command.move_destination_type = DataDestinationType::TABLE; + alter_command.from_database = src_database; + alter_command.from_table = 
src_table; + alter_command.to_database = dst_database; + alter_command.to_table = dst_table; + + required_access_rights = InterpreterAlterQuery::getRequiredAccessForCommand( + alter_command, table_id.database_name, table_id.table_name); + if (!access->isGranted(required_access_rights)) + { + access_denied = true; + continue; + } + code = storage->killExportPartition(transaction_id); + } + } + + insertResultRow(i, code, exports_block, header, res_columns); + } + + if (res_columns[0]->empty() && access_denied) + throw Exception(ErrorCodes::ACCESS_DENIED, "Not allowed to kill export partition. " + "To execute this query, it's necessary to have the grant {}", required_access_rights.toString()); + + res_io.pipeline = QueryPipeline(Pipe(std::make_shared(header.cloneWithColumns(std::move(res_columns))))); + + break; + } case ASTKillQueryQuery::Type::Mutation: { Block mutations_block = getSelectResult("database, table, mutation_id, command", "system.mutations"); @@ -462,6 +533,9 @@ AccessRightsElements InterpreterKillQueryQuery::getRequiredAccessForDDLOnCluster | AccessType::ALTER_MATERIALIZE_COLUMN | AccessType::ALTER_MATERIALIZE_TTL ); + /// todo arthur think about this + else if (query.type == ASTKillQueryQuery::Type::ExportPartition) + required_access.emplace_back(AccessType::ALTER_EXPORT_PARTITION); return required_access; } diff --git a/src/Parsers/ASTKillQueryQuery.cpp b/src/Parsers/ASTKillQueryQuery.cpp index 4f51713c2f4d..3bd4ef9f324e 100644 --- a/src/Parsers/ASTKillQueryQuery.cpp +++ b/src/Parsers/ASTKillQueryQuery.cpp @@ -27,6 +27,9 @@ void ASTKillQueryQuery::formatQueryImpl(WriteBuffer & ostr, const FormatSettings case Type::Transaction: ostr << "TRANSACTION"; break; + case Type::ExportPartition: + ostr << "EXPORT PARTITION"; + break; } ostr << (settings.hilite ? 
hilite_none : ""); diff --git a/src/Parsers/ASTKillQueryQuery.h b/src/Parsers/ASTKillQueryQuery.h index 99a14c56d72b..13d2811534f0 100644 --- a/src/Parsers/ASTKillQueryQuery.h +++ b/src/Parsers/ASTKillQueryQuery.h @@ -13,6 +13,7 @@ class ASTKillQueryQuery : public ASTQueryWithOutput, public ASTQueryWithOnCluste { Query, /// KILL QUERY Mutation, /// KILL MUTATION + ExportPartition, /// KILL EXPORT_PARTITION PartMoveToShard, /// KILL PART_MOVE_TO_SHARD Transaction, /// KILL TRANSACTION }; diff --git a/src/Parsers/ParserKillQueryQuery.cpp b/src/Parsers/ParserKillQueryQuery.cpp index 55bd5100009e..7e06ae8d30b7 100644 --- a/src/Parsers/ParserKillQueryQuery.cpp +++ b/src/Parsers/ParserKillQueryQuery.cpp @@ -17,6 +17,7 @@ bool ParserKillQueryQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expect ParserKeyword p_kill{Keyword::KILL}; ParserKeyword p_query{Keyword::QUERY}; ParserKeyword p_mutation{Keyword::MUTATION}; + ParserKeyword p_export_partition{Keyword::EXPORT_PARTITION}; ParserKeyword p_part_move_to_shard{Keyword::PART_MOVE_TO_SHARD}; ParserKeyword p_transaction{Keyword::TRANSACTION}; ParserKeyword p_on{Keyword::ON}; @@ -33,6 +34,8 @@ bool ParserKillQueryQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expect query->type = ASTKillQueryQuery::Type::Query; else if (p_mutation.ignore(pos, expected)) query->type = ASTKillQueryQuery::Type::Mutation; + else if (p_export_partition.ignore(pos, expected)) + query->type = ASTKillQueryQuery::Type::ExportPartition; else if (p_part_move_to_shard.ignore(pos, expected)) query->type = ASTKillQueryQuery::Type::PartMoveToShard; else if (p_transaction.ignore(pos, expected)) diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index b192f750cdca..bfb910a096ad 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -300,6 +300,11 @@ CancellationCode IStorage::killPartMoveToShard(const UUID & /*task_uuid*/) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part moves between shards are not supported by storage {}", getName()); } +CancellationCode IStorage::killExportPartition(const String & /*transaction_id*/) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Export partition is not supported by storage {}", getName()); +} + StorageID IStorage::getStorageID() const { std::lock_guard lock(id_mutex); diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 95d5a0c0e1a5..df7c2281d82d 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -574,6 +574,9 @@ It is currently only implemented in StorageObjectStorage. virtual void setMutationCSN(const String & /*mutation_id*/, UInt64 /*csn*/); + /// Cancel a replicated partition export by transaction id. + virtual CancellationCode killExportPartition(const String & /*transaction_id*/); + /// Cancel a part move to shard. 
virtual CancellationCode killPartMoveToShard(const UUID & /*task_uuid*/); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 52fdcddf0145..289cd9ec2909 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5910,14 +5910,15 @@ void MergeTreeData::exportPartToTable(const PartitionCommand & command, ContextP const auto database_name = query_context->resolveDatabase(command.to_database); - exportPartToTable(part_name, StorageID{database_name, command.to_table}, query_context); + exportPartToTable(part_name, StorageID{database_name, command.to_table}, query_context->getCurrentQueryId(), query_context); } void MergeTreeData::exportPartToTable( const std::string & part_name, const StorageID & destination_storage_id, + const String & transaction_id, ContextPtr query_context, - std::function completion_callback) + std::function completion_callback) { auto dest_storage = DatabaseCatalog::instance().getTable(destination_storage_id, query_context); @@ -5950,9 +5951,10 @@ void MergeTreeData::exportPartToTable( part_name, getStorageID().getFullTableName()); { - MergeTreeExportManifest manifest( + MergeTreePartExportManifest manifest( dest_storage->getStorageID(), part, + transaction_id, query_context->getSettingsRef()[Setting::export_merge_tree_part_overwrite_file_if_exists], query_context->getSettingsRef()[Setting::output_format_parallel_formatting], completion_callback); @@ -5970,7 +5972,7 @@ void MergeTreeData::exportPartToTable( } void MergeTreeData::exportPartToTableImpl( - const MergeTreeExportManifest & manifest, + const MergeTreePartExportManifest & manifest, ContextPtr local_context) { auto metadata_snapshot = getInMemoryMetadataPtr(); @@ -6027,7 +6029,7 @@ void MergeTreeData::exportPartToTableImpl( export_manifests.erase(manifest); if (manifest.completion_callback) - manifest.completion_callback(MergeTreeExportManifest::CompletionCallbackResult::createFailure(e.message())); + manifest.completion_callback(MergeTreePartExportManifest::CompletionCallbackResult::createFailure(e.message())); return; } @@ -6096,6 +6098,9 @@ void MergeTreeData::exportPartToTableImpl( pipeline.complete(sink); + /// oh boy, is there another way? 
+ manifest.pipeline = &pipeline; + try { CompletedPipelineExecutor exec(pipeline); @@ -6119,7 +6124,7 @@ void MergeTreeData::exportPartToTableImpl( ProfileEvents::increment(ProfileEvents::PartsExportTotalMilliseconds, static_cast((*exports_list_entry)->elapsed * 1000)); if (manifest.completion_callback) - manifest.completion_callback(MergeTreeExportManifest::CompletionCallbackResult::createSuccess(destination_file_path)); + manifest.completion_callback(MergeTreePartExportManifest::CompletionCallbackResult::createSuccess(destination_file_path)); } catch (const Exception & e) { @@ -6142,12 +6147,30 @@ void MergeTreeData::exportPartToTableImpl( export_manifests.erase(manifest); if (manifest.completion_callback) - manifest.completion_callback(MergeTreeExportManifest::CompletionCallbackResult::createFailure(e.message())); + manifest.completion_callback(MergeTreePartExportManifest::CompletionCallbackResult::createFailure(e.message())); throw; } } +void MergeTreeData::killExportPart(const String & query_id) +{ + std::lock_guard lock(export_manifests_mutex); + + const auto it = std::find_if(export_manifests.begin(), export_manifests.end(), [&](const auto & manifest) + { + return manifest.query_id == query_id; + }); + + if (it == export_manifests.end()) + return; + + if (it->pipeline) + it->pipeline->cancel(); + + export_manifests.erase(it); +} + void MergeTreeData::movePartitionToShard(const ASTPtr & /*partition*/, bool /*move_part*/, const String & /*to*/, ContextPtr /*query_context*/) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MOVE PARTITION TO SHARD is not supported by storage {}", getName()); @@ -8881,7 +8904,9 @@ bool MergeTreeData::scheduleDataMovingJob(BackgroundJobsAssignee & assignee) } manifest.in_progress = assignee.scheduleMoveTask(std::make_shared( - [this, manifest] () mutable { + [this, &manifest] () mutable { + /// TODO arthur fix this: I need to be able to modify the real manifest + /// but grabbing it by reference is causing problems exportPartToTableImpl(manifest, getContext()); return true; }, diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 9ee5b4f18506..bf07c33d04ce 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -35,8 +35,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -908,8 +908,12 @@ class MergeTreeData : public IStorage, public WithMutableContext void exportPartToTable( const std::string & part_name, - const StorageID & destination_storage_id, ContextPtr query_context, - std::function completion_callback = {}); + const StorageID & destination_storage_id, + const String & transaction_id, + ContextPtr query_context, + std::function completion_callback = {}); + + void killExportPart(const String & query_id); virtual void exportPartitionToTable(const PartitionCommand &, ContextPtr) { @@ -917,7 +921,7 @@ class MergeTreeData : public IStorage, public WithMutableContext } void exportPartToTableImpl( - const MergeTreeExportManifest & manifest, + const MergeTreePartExportManifest & manifest, ContextPtr local_context); /// Checks that Partition could be dropped right now @@ -1166,7 +1170,7 @@ class MergeTreeData : public IStorage, public WithMutableContext mutable std::mutex export_manifests_mutex; - std::set export_manifests; + std::set export_manifests; PinnedPartUUIDsPtr getPinnedPartUUIDs() const; diff --git a/src/Storages/MergeTree/MergeTreeExportManifest.h b/src/Storages/MergeTree/MergeTreePartExportManifest.h similarity 
index 84% rename from src/Storages/MergeTree/MergeTreeExportManifest.h rename to src/Storages/MergeTree/MergeTreePartExportManifest.h index bcc480c8507e..3052bccec6bc 100644 --- a/src/Storages/MergeTree/MergeTreeExportManifest.h +++ b/src/Storages/MergeTree/MergeTreePartExportManifest.h @@ -1,10 +1,11 @@ #include #include +#include "QueryPipeline/QueryPipeline.h" namespace DB { -struct MergeTreeExportManifest +struct MergeTreePartExportManifest { using DataPartPtr = std::shared_ptr; @@ -30,14 +31,16 @@ struct MergeTreeExportManifest String exception; }; - MergeTreeExportManifest( + MergeTreePartExportManifest( const StorageID & destination_storage_id_, const DataPartPtr & data_part_, + const String & query_id_, bool overwrite_file_if_exists_, bool parallel_formatting_, std::function completion_callback_ = {}) : destination_storage_id(destination_storage_id_), data_part(data_part_), + query_id(query_id_), overwrite_file_if_exists(overwrite_file_if_exists_), parallel_formatting(parallel_formatting_), completion_callback(completion_callback_), @@ -45,6 +48,8 @@ struct MergeTreeExportManifest StorageID destination_storage_id; DataPartPtr data_part; + /// Used for killing the export. + String query_id; bool overwrite_file_if_exists; bool parallel_formatting; @@ -53,8 +58,10 @@ struct MergeTreeExportManifest time_t create_time; mutable bool in_progress = false; + /// Used for killing the export + mutable QueryPipeline * pipeline = nullptr; - bool operator<(const MergeTreeExportManifest & rhs) const + bool operator<(const MergeTreePartExportManifest & rhs) const { // Lexicographic comparison: first compare destination storage, then part name auto lhs_storage = destination_storage_id.getQualifiedName(); @@ -66,7 +73,7 @@ struct MergeTreeExportManifest return data_part->name < rhs.data_part->name; } - bool operator==(const MergeTreeExportManifest & rhs) const + bool operator==(const MergeTreePartExportManifest & rhs) const { return destination_storage_id.getQualifiedName() == rhs.destination_storage_id.getQualifiedName() && data_part->name == rhs.data_part->name; diff --git a/src/Storages/MergeTree/MergeTreeExportStatus.h b/src/Storages/MergeTree/MergeTreePartExportStatus.h similarity index 100% rename from src/Storages/MergeTree/MergeTreeExportStatus.h rename to src/Storages/MergeTree/MergeTreePartExportStatus.h diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 765ace20379d..e27b22e6e68e 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -119,7 +119,6 @@ #include "QueryPipeline/QueryPlanResourceHolder.h" #include "Storages/ExportReplicatedMergeTreePartitionManifest.h" #include "Storages/ExportReplicatedMergeTreePartitionTaskEntry.h" -#include #include #include #include @@ -4471,7 +4470,27 @@ void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() /// Remove entries that were deleted by someone else std::erase_if(export_merge_tree_partition_task_entries, - [&](auto const & kv) { return !zk_children.contains(kv.first); }); + [&](auto const & kv) + { + if (zk_children.contains(kv.first)) + { + return false; + } + + const auto & transaction_id = kv.second.manifest.transaction_id; + LOG_INFO(log, "Export task {} was deleted, calling killExportPartition for transaction {}", kv.first, transaction_id); + + try + { + killExportPart(transaction_id); + } + catch (...) 
+ { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + } + + return true; + }); if (cleanup_lock_acquired) { @@ -4809,9 +4828,10 @@ void StorageReplicatedMergeTree::selectPartsToExport() exportPartToTable( part_to_export.value(), destination_storage_id, + transaction_id, getContext(), [this, partition_id, transaction_id, exports_path, key, part_to_export, complete_part_export, partition_path, next_idx_path, next_idx, destination_storage] - (MergeTreeExportManifest::CompletionCallbackResult result) + (MergeTreePartExportManifest::CompletionCallbackResult result) { const auto zk_client = getZooKeeper(); if (result.success) @@ -8724,18 +8744,6 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & const auto parts = getDataPartsVectorInPartitionForInternalUsage(MergeTreeDataPartState::Active, partition_id, &data_parts_lock); - // const Strings parts = zookeeper->getChildren(fs::path(replica_path) / "parts"); - // const ActiveDataPartSet active_parts_set(format_version, parts); - // const auto part_infos = active_parts_set.getPartInfos(); - // std::vector parts_to_export; - // for (const auto & part : parts) - // { - // if (part_info.getPartitionId() == partition_id) - // { - // parts_to_export.push_back(part_info.getPartNameV1()); - // } - // } - if (parts.empty()) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition {} doesn't exist", partition_id); @@ -8751,7 +8759,7 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & ExportReplicatedMergeTreePartitionManifest manifest; - manifest.transaction_id = std::to_string(generateSnowflakeID()); + manifest.transaction_id = query_context->getCurrentQueryId(); manifest.partition_id = partition_id; manifest.destination_database = dest_database; manifest.destination_table = dest_table; @@ -10066,6 +10074,49 @@ CancellationCode StorageReplicatedMergeTree::killPartMoveToShard(const UUID & ta return part_moves_between_shards_orchestrator.killPartMoveToShard(task_uuid); } +CancellationCode StorageReplicatedMergeTree::killExportPartition(const String & transaction_id) +{ + auto zk = getZooKeeper(); + const auto exports_path = fs::path(zookeeper_path) / "exports"; + + const auto export_keys = zk->getChildren(exports_path); + String export_key_to_be_cancelled; + for (const auto & export_key : export_keys) + { + std::string status; + if (!zk->tryGet(fs::path(exports_path) / export_key / "status", status)) + continue; + if (status != "PENDING") + continue; + + std::string metadata_json; + if (!zk->tryGet(fs::path(exports_path) / export_key / "metadata.json", metadata_json)) + continue; + const auto manifest = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); + if (manifest.transaction_id == transaction_id) + { + export_key_to_be_cancelled = export_key; + break; + } + } + + if (export_key_to_be_cancelled.empty()) + return CancellationCode::NotFound; + + try + { + /// Once the entry is removed from zk, the update task will be triggered in all replicas + /// The logic for cancelling individual part exports will be triggered in the update task. + zk->removeRecursive(fs::path(exports_path) / export_key_to_be_cancelled); + return CancellationCode::CancelSent; + } + catch (...) 
+ { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + return CancellationCode::CancelCannotBeSent; + } +} + void StorageReplicatedMergeTree::getCommitPartOps( Coordination::Requests & ops, const DataPartPtr & part, diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index c519f1feddb3..9db5d37340b7 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -13,6 +13,7 @@ #include #include #include +#include "Interpreters/CancellationCode.h" #include #include #include @@ -934,6 +935,7 @@ class StorageReplicatedMergeTree final : public MergeTreeData void movePartitionToTable(const StoragePtr & dest_table, const ASTPtr & partition, ContextPtr query_context) override; void movePartitionToShard(const ASTPtr & partition, bool move_part, const String & to, ContextPtr query_context) override; CancellationCode killPartMoveToShard(const UUID & task_uuid) override; + CancellationCode killExportPartition(const String & transaction_id) override; void fetchPartition( const ASTPtr & partition, const StorageMetadataPtr & metadata_snapshot, From 2fbf3955f662e31cc1d0926dc64ac907ab0cef24 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 22 Oct 2025 09:42:25 -0300 Subject: [PATCH 17/17] working kill export, update next idx upon lock and lock as many parts as one can at once --- src/Storages/MergeTree/ExportPartTask.cpp | 247 ++++++++++++++++++ src/Storages/MergeTree/ExportPartTask.h | 32 +++ src/Storages/MergeTree/MergeTreeData.cpp | 43 ++- src/Storages/MergeTree/MergeTreeData.h | 1 + .../MergeTree/MergeTreePartExportManifest.h | 9 +- src/Storages/StorageReplicatedMergeTree.cpp | 74 +++--- 6 files changed, 348 insertions(+), 58 deletions(-) create mode 100644 src/Storages/MergeTree/ExportPartTask.cpp create mode 100644 src/Storages/MergeTree/ExportPartTask.h diff --git a/src/Storages/MergeTree/ExportPartTask.cpp b/src/Storages/MergeTree/ExportPartTask.cpp new file mode 100644 index 000000000000..4b247946c76a --- /dev/null +++ b/src/Storages/MergeTree/ExportPartTask.cpp @@ -0,0 +1,247 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event PartsExportDuplicated; + extern const Event PartsExportFailures; + extern const Event PartsExports; + extern const Event PartsExportTotalMilliseconds; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_TABLE; + extern const int FILE_ALREADY_EXISTS; + extern const int LOGICAL_ERROR; +} + +namespace Setting +{ + extern const SettingsUInt64 min_bytes_to_use_direct_io; +} + +ExportPartTask::ExportPartTask(MergeTreeData & storage_, const MergeTreePartExportManifest & manifest_, ContextPtr context_) + : storage(storage_), + manifest(manifest_), + local_context(context_) +{ +} + +bool ExportPartTask::executeStep() +{ + auto metadata_snapshot = storage.getInMemoryMetadataPtr(); + Names columns_to_read = metadata_snapshot->getColumns().getNamesOfPhysical(); + StorageSnapshotPtr storage_snapshot = storage.getStorageSnapshot(metadata_snapshot, local_context); + + MergeTreeSequentialSourceType read_type = MergeTreeSequentialSourceType::Export; + + NamesAndTypesList partition_columns; + if (metadata_snapshot->hasPartitionKey()) + { + const auto & partition_key = metadata_snapshot->getPartitionKey(); + if (!partition_key.column_names.empty()) + partition_columns = partition_key.expression->getRequiredColumnsWithTypes(); + } + + auto 
From 2fbf3955f662e31cc1d0926dc64ac907ab0cef24 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Wed, 22 Oct 2025 09:42:25 -0300
Subject: [PATCH 17/17] Working kill export; update next_idx upon lock and
 lock as many parts as possible at once
---
 src/Storages/MergeTree/ExportPartTask.cpp   | 247 ++++++++++++++++++
 src/Storages/MergeTree/ExportPartTask.h     |  32 +++
 src/Storages/MergeTree/MergeTreeData.cpp    |  43 ++-
 src/Storages/MergeTree/MergeTreeData.h      |   1 +
 .../MergeTree/MergeTreePartExportManifest.h |   9 +-
 src/Storages/StorageReplicatedMergeTree.cpp |  74 +++---
 6 files changed, 348 insertions(+), 58 deletions(-)
 create mode 100644 src/Storages/MergeTree/ExportPartTask.cpp
 create mode 100644 src/Storages/MergeTree/ExportPartTask.h

diff --git a/src/Storages/MergeTree/ExportPartTask.cpp b/src/Storages/MergeTree/ExportPartTask.cpp
new file mode 100644
index 000000000000..4b247946c76a
--- /dev/null
+++ b/src/Storages/MergeTree/ExportPartTask.cpp
@@ -0,0 +1,247 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace ProfileEvents
+{
+    extern const Event PartsExportDuplicated;
+    extern const Event PartsExportFailures;
+    extern const Event PartsExports;
+    extern const Event PartsExportTotalMilliseconds;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int UNKNOWN_TABLE;
+    extern const int FILE_ALREADY_EXISTS;
+    extern const int LOGICAL_ERROR;
+}
+
+namespace Setting
+{
+    extern const SettingsUInt64 min_bytes_to_use_direct_io;
+}
+
+ExportPartTask::ExportPartTask(MergeTreeData & storage_, const MergeTreePartExportManifest & manifest_, ContextPtr context_)
+    : storage(storage_),
+    manifest(manifest_),
+    local_context(context_)
+{
+}
+
+bool ExportPartTask::executeStep()
+{
+    auto metadata_snapshot = storage.getInMemoryMetadataPtr();
+    Names columns_to_read = metadata_snapshot->getColumns().getNamesOfPhysical();
+    StorageSnapshotPtr storage_snapshot = storage.getStorageSnapshot(metadata_snapshot, local_context);
+
+    MergeTreeSequentialSourceType read_type = MergeTreeSequentialSourceType::Export;
+
+    NamesAndTypesList partition_columns;
+    if (metadata_snapshot->hasPartitionKey())
+    {
+        const auto & partition_key = metadata_snapshot->getPartitionKey();
+        if (!partition_key.column_names.empty())
+            partition_columns = partition_key.expression->getRequiredColumnsWithTypes();
+    }
+
+    auto block_with_partition_values = manifest.data_part->partition.getBlockWithPartitionValues(partition_columns);
+
+    auto destination_storage = DatabaseCatalog::instance().tryGetTable(manifest.destination_storage_id, local_context);
+    if (!destination_storage)
+    {
+        std::lock_guard inner_lock(storage.export_manifests_mutex);
+
+        const auto destination_storage_id_name = manifest.destination_storage_id.getNameForLogs();
+        storage.export_manifests.erase(manifest);
+        throw Exception(ErrorCodes::UNKNOWN_TABLE, "Failed to reconstruct destination storage: {}", destination_storage_id_name);
+    }
+
+    SinkToStoragePtr sink;
+    std::string destination_file_path;
+
+    try
+    {
+        auto context_copy = Context::createCopy(local_context);
+        context_copy->setSetting("output_format_parallel_formatting", manifest.parallel_formatting);
+
+        sink = destination_storage->import(
+            manifest.data_part->name + "_" + manifest.data_part->checksums.getTotalChecksumHex(),
+            block_with_partition_values,
+            destination_file_path,
+            manifest.overwrite_file_if_exists,
+            context_copy);
+    }
+    catch (const Exception & e)
+    {
+        if (e.code() == ErrorCodes::FILE_ALREADY_EXISTS)
+        {
+            ProfileEvents::increment(ProfileEvents::PartsExportDuplicated);
+        }
+
+        ProfileEvents::incrementNoTrace(ProfileEvents::PartsExportFailures);
+
+        std::lock_guard inner_lock(storage.export_manifests_mutex);
+        storage.export_manifests.erase(manifest);
+
+        if (manifest.completion_callback)
+            manifest.completion_callback(MergeTreePartExportManifest::CompletionCallbackResult::createFailure(e.message()));
+        return false;
+    }
+
+    bool apply_deleted_mask = true;
+    bool read_with_direct_io = local_context->getSettingsRef()[Setting::min_bytes_to_use_direct_io] > manifest.data_part->getBytesOnDisk();
+    bool prefetch = false;
+
+    MergeTreeData::IMutationsSnapshot::Params params
+    {
+        .metadata_version = metadata_snapshot->getMetadataVersion(),
+        .min_part_metadata_version = manifest.data_part->getMetadataVersion(),
+    };
+
+    auto mutations_snapshot = storage.getMutationsSnapshot(params);
+
+    auto alter_conversions = MergeTreeData::getAlterConversionsForPart(
+        manifest.data_part,
+        mutations_snapshot,
+        local_context);
+
+    QueryPlan plan_for_part;
+
+    createReadFromPartStep(
+        read_type,
+        plan_for_part,
+        storage,
+        storage_snapshot,
+        RangesInDataPart(manifest.data_part),
+        alter_conversions,
+        nullptr,
+        columns_to_read,
+        nullptr,
+        apply_deleted_mask,
+        std::nullopt,
+        read_with_direct_io,
+        prefetch,
+        local_context,
+        getLogger("ExportPartition"));
+
+    auto exports_list_entry = storage.getContext()->getExportsList().insert(
+        getStorageID(),
+        manifest.destination_storage_id,
+        manifest.data_part->getBytesOnDisk(),
+        manifest.data_part->name,
+        destination_file_path,
+        manifest.data_part->rows_count,
+        manifest.data_part->getBytesOnDisk(),
+        manifest.data_part->getBytesUncompressedOnDisk(),
+        manifest.create_time,
+        local_context);
+
+    ThreadGroupSwitcher switcher((*exports_list_entry)->thread_group, "");
+
+    QueryPlanOptimizationSettings optimization_settings(local_context);
+    auto pipeline_settings = BuildQueryPipelineSettings(local_context);
+    auto builder = plan_for_part.buildQueryPipeline(optimization_settings, pipeline_settings);
+
+    builder->setProgressCallback([&exports_list_entry](const Progress & progress)
+    {
+        (*exports_list_entry)->bytes_read_uncompressed += progress.read_bytes;
+        (*exports_list_entry)->rows_read += progress.read_rows;
+        (*exports_list_entry)->elapsed = (*exports_list_entry)->watch.elapsedSeconds();
+    });
+
+    pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder));
+
+    pipeline.complete(sink);
+
+    try
+    {
+        CompletedPipelineExecutor exec(pipeline);
+        exec.execute();
+
+        std::lock_guard inner_lock(storage.export_manifests_mutex);
+        storage.writePartLog(
+            PartLogElement::Type::EXPORT_PART,
+            {},
+            static_cast((*exports_list_entry)->elapsed * 1000000000),
+            manifest.data_part->name,
+            manifest.data_part,
+            {manifest.data_part},
+            nullptr,
+            nullptr,
+            exports_list_entry.get());
+
+        storage.export_manifests.erase(manifest);
+
+        ProfileEvents::increment(ProfileEvents::PartsExports);
+        ProfileEvents::increment(ProfileEvents::PartsExportTotalMilliseconds, static_cast((*exports_list_entry)->elapsed * 1000));
+
+        if (manifest.completion_callback)
+            manifest.completion_callback(MergeTreePartExportManifest::CompletionCallbackResult::createSuccess(destination_file_path));
+    }
+    catch (const Exception & e)
+    {
+        tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while exporting the part {}. User should retry.", manifest.data_part->name));
+
+        ProfileEvents::increment(ProfileEvents::PartsExportFailures);
+
+        std::lock_guard inner_lock(storage.export_manifests_mutex);
+        storage.writePartLog(
+            PartLogElement::Type::EXPORT_PART,
+            ExecutionStatus::fromCurrentException("", true),
+            static_cast((*exports_list_entry)->elapsed * 1000000000),
+            manifest.data_part->name,
+            manifest.data_part,
+            {manifest.data_part},
+            nullptr,
+            nullptr,
+            exports_list_entry.get());
+
+        storage.export_manifests.erase(manifest);
+
+        if (manifest.completion_callback)
+            manifest.completion_callback(MergeTreePartExportManifest::CompletionCallbackResult::createFailure(e.message()));
+
+        throw;
+    }
+    return false;
+}
+
+void ExportPartTask::cancel() noexcept
+{
+    pipeline.cancel();
+}
+
+void ExportPartTask::onCompleted()
+{
+}
+
+StorageID ExportPartTask::getStorageID() const
+{
+    return manifest.destination_storage_id;
+}
+
+Priority ExportPartTask::getPriority() const
+{
+    return Priority{};
+}
+
+String ExportPartTask::getQueryId() const
+{
+    return manifest.query_id;
+}
+
+}
diff --git a/src/Storages/MergeTree/ExportPartTask.h b/src/Storages/MergeTree/ExportPartTask.h
new file mode 100644
index 000000000000..790ef9f6ecba
--- /dev/null
+++ b/src/Storages/MergeTree/ExportPartTask.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include
+#include
+#include
+
+namespace DB
+{
+
+class ExportPartTask : public IExecutableTask
+{
+public:
+    explicit ExportPartTask(
+        MergeTreeData & storage_,
+        const MergeTreePartExportManifest & manifest_,
+        ContextPtr context_);
+    bool executeStep() override;
+    void onCompleted() override;
+    StorageID getStorageID() const override;
+    Priority getPriority() const override;
+    String getQueryId() const override;
+
+    void cancel() noexcept override;
+
+private:
+    MergeTreeData & storage;
+    MergeTreePartExportManifest manifest;
+    ContextPtr local_context;
+    QueryPipeline pipeline;
+};
+
+}
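For readers unfamiliar with the background-task interface used by the new class: a minimal sketch of how an IExecutableTask such as ExportPartTask is typically driven, assuming the usual executor contract where executeStep() returns true while more steps remain. The driver function below is illustrative only and is not part of this patch.

/// Illustrative driver only (not part of the patch).
void driveExportTask(const std::shared_ptr<ExportPartTask> & task)
{
    while (task->executeStep())     /// ExportPartTask performs the whole export in a single step and returns false.
        ;
    task->onCompleted();            /// No-op here; success or failure is reported through the manifest's completion callback.
}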
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 289cd9ec2909..252d8b500a28 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include "Storages/MergeTree/ExportPartTask.h"
 #include
 #include
 #include
@@ -6098,9 +6099,6 @@ void MergeTreeData::exportPartToTableImpl(
 
     pipeline.complete(sink);
 
-    /// oh boy, is there another way?
-    manifest.pipeline = &pipeline;
-
     try
     {
         CompletedPipelineExecutor exec(pipeline);
@@ -6157,18 +6155,16 @@ void MergeTreeData::killExportPart(const String & query_id)
 {
     std::lock_guard lock(export_manifests_mutex);
 
-    const auto it = std::find_if(export_manifests.begin(), export_manifests.end(), [&](const auto & manifest)
+    std::erase_if(export_manifests, [&](const auto & manifest)
     {
-        return manifest.query_id == query_id;
+        if (manifest.query_id == query_id)
+        {
+            if (manifest.task)
+                manifest.task->cancel();
+            return true;
+        }
+        return false;
     });
-
-    if (it == export_manifests.end())
-        return;
-
-    if (it->pipeline)
-        it->pipeline->cancel();
-
-    export_manifests.erase(it);
 }
 
 void MergeTreeData::movePartitionToShard(const ASTPtr & /*partition*/, bool /*move_part*/, const String & /*to*/, ContextPtr /*query_context*/)
@@ -8903,17 +8899,18 @@ bool MergeTreeData::scheduleDataMovingJob(BackgroundJobsAssignee & assignee)
             continue;
         }
 
-        manifest.in_progress = assignee.scheduleMoveTask(std::make_shared(
-            [this, &manifest] () mutable {
-                /// TODO arthur fix this: I need to be able to modify the real manifest
-                /// but grabbing it by reference is causing problems
-                exportPartToTableImpl(manifest, getContext());
-                return true;
-            },
-            moves_assignee_trigger,
-            getStorageID()));
+        auto task = std::make_shared<ExportPartTask>(*this, manifest, getContext());
+
+        manifest.in_progress = assignee.scheduleMoveTask(task);
+
+        if (!manifest.in_progress)
+        {
+            continue;
+        }
 
-        return manifest.in_progress;
+        manifest.task = task;
+
+        return true;
     }
 
     return false;
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index bf07c33d04ce..8f7cd253aa52 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -1258,6 +1258,7 @@ class MergeTreeData : public IStorage, public WithMutableContext
     friend class MergeTask;
     friend class IPartMetadataManager;
     friend class IMergedBlockOutputStream; // for access to log
+    friend class ExportPartTask;
 
     bool require_part_metadata;
diff --git a/src/Storages/MergeTree/MergeTreePartExportManifest.h b/src/Storages/MergeTree/MergeTreePartExportManifest.h
index 3052bccec6bc..dd55b21c8a23 100644
--- a/src/Storages/MergeTree/MergeTreePartExportManifest.h
+++ b/src/Storages/MergeTree/MergeTreePartExportManifest.h
@@ -1,10 +1,14 @@
+#pragma once
+
 #include
 #include
-#include "QueryPipeline/QueryPipeline.h"
+#include
 
 namespace DB
 {
 
+class ExportPartTask;
+
 struct MergeTreePartExportManifest
 {
     using DataPartPtr = std::shared_ptr;
@@ -58,8 +62,7 @@ struct MergeTreePartExportManifest
     time_t create_time;
 
     mutable bool in_progress = false;
-    /// Used for killing the export
-    mutable QueryPipeline * pipeline = nullptr;
+    mutable std::shared_ptr<ExportPartTask> task = nullptr;
 
     bool operator<(const MergeTreePartExportManifest & rhs) const
     {
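A condensed view of the cancellation flow implied by the two hunks above (descriptive comments only, not new code): the manifest now keeps the running task alive via a shared_ptr instead of holding a raw QueryPipeline pointer, so a kill request can stop the pipeline through the task without touching a possibly destroyed pipeline.

/// Cancellation flow (illustration, reconstructed from this patch):
///   MergeTreeData::killExportPart(query_id)
///       -> under export_manifests_mutex, finds the manifest whose query_id matches
///       -> manifest.task->cancel()            (ExportPartTask::cancel is noexcept)
///           -> pipeline.cancel()              (stops the CompletedPipelineExecutor running the export)
///       -> erases the manifest from export_manifests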
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index e27b22e6e68e..a2bfbf92416e 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -4656,21 +4656,6 @@ void StorageReplicatedMergeTree::selectPartsToExport()
         return false;
     }
 
-    // std::string parts_to_do_zk;
-    // if (!zk->tryGet(fs::path(export_partition_path) / "parts_to_do", parts_to_do_zk))
-    // {
-    //     LOG_INFO(log, "Failed to get parts_to_do, skipping");
-    //     return false;
-    // }
-
-    // const auto parts_to_do = std::stoull(parts_to_do_zk.c_str());
-
-    // if (parts_to_do == 0)
-    // {
-    //     LOG_INFO(log, "Skipping... Parts to do is 0, maybe someone else already completed it?");
-    //     return false;
-    // }
-
     /// only try to lock it if the status is still PENDING
     /// if we did not check for status = pending, chances are some other replica completed and released the lock in the meantime
    ops.emplace_back(zkutil::makeCheckRequest(status_path, stat.version));
@@ -4701,9 +4686,31 @@ void StorageReplicatedMergeTree::selectPartsToExport()
     auto try_to_acquire_a_part = [&](
         const ExportReplicatedMergeTreePartitionManifest & manifest,
         const std::string & partition_export_path,
-        const std::size_t next_idx,
+        std::size_t & next_idx,
         const ZooKeeperPtr & zk) -> std::optional
     {
+        const auto try_to_update_next_idx = [&](std::size_t local_next_idx)
+        {
+            /// Update next_idx - best effort, if it fails, that is ok
+            Coordination::Stat next_idx_stat;
+            std::string next_idx_string;
+            const auto next_idx_path = fs::path(partition_export_path) / "next_idx";
+
+            if (zk->tryGet(next_idx_path, next_idx_string, &next_idx_stat))
+            {
+                const std::size_t next_idx_zk = std::stoul(next_idx_string.c_str());
+                const std::size_t new_next_idx = std::max(next_idx_zk, local_next_idx + 1);
+                Coordination::Requests ops;
+                ops.emplace_back(zkutil::makeCheckRequest(next_idx_path, next_idx_stat.version));
+                ops.emplace_back(zkutil::makeSetRequest(next_idx_path, std::to_string(new_next_idx), next_idx_stat.version));
+                Coordination::Responses responses;
+                if (zk->tryMulti(ops, responses) == Coordination::Error::ZOK)
+                {
+                    LOG_INFO(log, "Updated next_idx to {}", new_next_idx);
+                }
+            }
+        };
+
         for (auto i = next_idx; i < manifest.number_of_parts; i++)
         {
             const auto part_path = fs::path(partition_export_path) / "parts" / manifest.parts[i];
@@ -4719,6 +4726,8 @@ void StorageReplicatedMergeTree::selectPartsToExport()
 
             if (lock_part(part_path, status_path, lock_path, replica_name, zk))
             {
+                next_idx = i;
+                try_to_update_next_idx(i);
                 return manifest.parts[i];
             }
         }
@@ -4740,6 +4749,8 @@ void StorageReplicatedMergeTree::selectPartsToExport()
 
             if (lock_part(part_path, status_path, lock_path, replica_name, zk))
             {
+                next_idx = i;
+                try_to_update_next_idx(i);
                 return manifest.parts[i];
             }
         }
@@ -4790,21 +4801,21 @@ void StorageReplicatedMergeTree::selectPartsToExport()
                 LOG_INFO(log, "Skipping... Status is not PENDING");
                 continue;
             }
-            
+
             const auto & manifest = task_entry.manifest;
             const auto & database = getContext()->resolveDatabase(manifest.destination_database);
             const auto & table = manifest.destination_table;
-            
+
             const auto destination_storage_id = StorageID(QualifiedTableName {database, table});
-            
+
             const auto destination_storage = DatabaseCatalog::instance().tryGetTable(destination_storage_id, getContext());
-            
+
             if (!destination_storage)
             {
                 LOG_INFO(log, "Failed to reconstruct destination storage: {}, skipping", destination_storage_id.getNameForLogs());
                 continue;
             }
-            
+
             const auto partition_path = fs::path(exports_path) / key;
             const auto next_idx_path = fs::path(partition_path) / "next_idx";
             std::string next_idx_string;
             if (!zk->tryGet(next_idx_path, next_idx_string))
             {
                 LOG_INFO(log, "Failed to get next_idx, skipping");
                 continue;
             }
-            
-            const auto next_idx = std::stoull(next_idx_string.c_str());
-
-            const auto part_to_export = try_to_acquire_a_part(manifest, partition_path, next_idx, zk);
-
-            const auto partition_id = manifest.partition_id;
-            const auto transaction_id = manifest.transaction_id;
-
-            if (part_to_export.has_value())
+
+            std::size_t next_idx = std::stoull(next_idx_string.c_str());
+            while (const auto part_to_export = try_to_acquire_a_part(manifest, partition_path, next_idx, zk))
             {
+                const auto partition_id = manifest.partition_id;
+                const auto transaction_id = manifest.transaction_id;
+
                 try
                 {
                     exportPartToTable(
@@ -4937,11 +4945,14 @@ void StorageReplicatedMergeTree::selectPartsToExport()
 
                     /// best-effort to remove the lock (actually, we should make sure the lock is released..)
                     zk->tryRemove(fs::path(partition_path) / "parts" / part_to_export.value() / "lock");
+
+                    /// TODO arthur: should I try to rewind the next_idx?
 
                     /// re-run after some time
                     export_merge_tree_partition_select_task->scheduleAfter(1000 * 5);
+
+                    break;
                 }
-
             }
         }
     }
@@ -4950,7 +4961,6 @@ void StorageReplicatedMergeTree::selectPartsToExport()
         tryLogCurrentException(log, __PRETTY_FUNCTION__);
         export_merge_tree_partition_select_task->scheduleAfter(1000 * 5);
     }
-    // export_merge_tree_partition_select_task->scheduleAfter(1000 * 5);
 }
 
 std::vector StorageReplicatedMergeTree::getPartitionExportsInfo() const