Skip to content

Commit d2a5361

Browse files
committed
iotune: Add physical block size detection
Implement empirical detection of the physical block size by testing write performance at different alignments (512, 1K, 2K, 4K, and 8K bytes). This addresses the issue raised in PR scylladb#3046, where some disks lie about their physical_block_size via sysfs.

The detection algorithm:
1. Performs random write tests at each alignment.
2. Measures IOPS for each alignment, with detailed logging.
3. Finds the smallest alignment where performance plateaus (i.e. where the next larger alignment does not improve IOPS by more than 5%).
4. Compares the detected value with the driver-reported value and warns if they differ.
5. Writes the detected value to io_properties.yaml.

The reactor can then use this empirically determined value to override what the disk reports, preventing write amplification due to hardware-level read-modify-write cycles.

Signed-off-by: Kefu Chai <k.chai@proxmox.com>
1 parent 157ab57 commit d2a5361

File tree

1 file changed

+111
-0
lines changed

1 file changed

+111
-0
lines changed

apps/iotune/iotune.cc

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <optional>
2727
#include <random>
2828
#include <memory>
29+
#include <map>
2930
#include <ranges>
3031
#include <vector>
3132
#include <cmath>
@@ -106,6 +107,7 @@ struct evaluation_directory {
106107
uint64_t _available_space;
107108
uint64_t _min_data_transfer_size = 512;
108109
unsigned _disks_per_array = 0;
110+
std::optional<uint64_t> _reported_physical_block_size;
109111

110112
void scan_device(unsigned dev_maj, unsigned dev_min) {
111113
scan_device(fmt::format("{}:{}", dev_maj, dev_min));
@@ -140,6 +142,15 @@ struct evaluation_directory {
140142

141143
_min_data_transfer_size = std::max(_min_data_transfer_size, disk_min_io_size);
142144
_max_iodepth += read_first_line_as<uint64_t>(queue_dir / "nr_requests");
145+
146+
// Read physical_block_size from sysfs for comparison with detected value
147+
if (!_reported_physical_block_size) {
148+
auto pbs_file = queue_dir / "physical_block_size";
149+
if (fs::exists(pbs_file)) {
150+
_reported_physical_block_size = read_first_line_as<uint64_t>(pbs_file);
151+
}
152+
}
153+
143154
_disks_per_array++;
144155
}
145156
} catch (std::system_error& se) {
@@ -178,6 +189,10 @@ struct evaluation_directory {
178189
return _min_data_transfer_size;
179190
}
180191

192+
std::optional<uint64_t> reported_physical_block_size() const {
193+
return _reported_physical_block_size;
194+
}
195+
181196
future<> discover_directory() {
182197
return seastar::async([this] {
183198
auto f = open_directory(_name).get();
@@ -674,6 +689,90 @@ class iotune_multi_shard_context {
674689
return saturate(rate_threshold, buffer_size, duration, &test_file::read_workload);
675690
}
676691

692+
// Detect physical block size by testing write performance at different alignments.
693+
// Returns the smallest alignment where write performance is good (no RMW penalty).
694+
// Tests alignments from 512 bytes up to 8192 bytes.
695+
//
696+
// The detection works by measuring write IOPS at each alignment. If writes smaller
697+
// than the physical block size are issued, the disk must perform read-modify-write
698+
// (RMW) operations, which reduces IOPS. Once we reach the physical block size,
699+
// IOPS should plateau as larger alignments don't provide additional benefit.
700+
future<uint64_t> detect_physical_block_size(std::chrono::duration<double> duration, std::optional<uint64_t> reported_pbs = std::nullopt) {
701+
return seastar::async([this, duration, reported_pbs] {
702+
// Test alignments: 512, 1024, 2048, 4096, 8192
703+
// This covers most common physical block sizes
704+
std::vector<uint64_t> test_alignments = {512, 1024, 2048, 4096, 8192};
705+
std::map<uint64_t, float> alignment_to_iops;
706+
707+
iotune_logger.info("Detecting physical block size by testing write performance at different alignments");
708+
709+
for (auto alignment : test_alignments) {
710+
// Perform a short write test at this alignment (5% of total test duration)
711+
auto rates = write_random_data(alignment, duration * 0.05).get();
712+
alignment_to_iops[alignment] = rates.iops;
713+
714+
iotune_logger.info("Alignment {} bytes: {} IOPS", alignment, uint64_t(rates.iops));
715+
716+
// Clear rates after each test
717+
sharded_rates.invoke_on_all([] (std::vector<unsigned>& rates) {
718+
rates.clear();
719+
return make_ready_future<>();
720+
}).get();
721+
}
722+
723+
// Find the smallest alignment where performance plateaus.
724+
// We look for the point where increasing alignment doesn't significantly improve IOPS.
725+
// This indicates we've reached or exceeded the physical block size.
726+
//
727+
// Algorithm: Find the first alignment where the next larger alignment doesn't
728+
// improve IOPS by more than 5%. This is the physical block size.
729+
uint64_t detected_size = test_alignments.back(); // Default to largest if no plateau found
730+
float max_iops = 0;
731+
732+
for (size_t i = 0; i < test_alignments.size() - 1; ++i) {
733+
auto current_alignment = test_alignments[i];
734+
auto next_alignment = test_alignments[i + 1];
735+
auto current_iops = alignment_to_iops[current_alignment];
736+
auto next_iops = alignment_to_iops[next_alignment];
737+
738+
max_iops = std::max(max_iops, current_iops);
739+
740+
// If the next larger alignment doesn't improve IOPS by more than 5%,
741+
// we've found the physical block size
742+
if (next_iops <= current_iops * 1.05) {
743+
detected_size = current_alignment;
744+
iotune_logger.info("Performance plateau detected at {} bytes", detected_size);
745+
break;
746+
}
747+
}
748+
749+
// If we didn't find a plateau, use the alignment with the best IOPS
750+
if (detected_size == test_alignments.back()) {
751+
for (const auto& [alignment, iops] : alignment_to_iops) {
752+
if (iops >= max_iops * 0.95) { // Within 5% of max
753+
detected_size = std::min(detected_size, alignment);
754+
}
755+
}
756+
iotune_logger.info("No clear plateau, using alignment with best performance: {} bytes", detected_size);
757+
}
758+
759+
iotune_logger.info("Detected physical block size: {} bytes", detected_size);
760+
761+
// Compare with driver-reported value if available
762+
if (reported_pbs) {
763+
if (detected_size == *reported_pbs) {
764+
iotune_logger.info("Detected value matches driver-reported value ({} bytes)", *reported_pbs);
765+
} else {
766+
iotune_logger.warn("Detected value differs from driver-reported value ({} bytes)", *reported_pbs);
767+
iotune_logger.warn("The disk may be lying about its physical block size");
768+
iotune_logger.warn("Using empirically detected value {} bytes to avoid write amplification", detected_size);
769+
}
770+
}
771+
772+
return detected_size;
773+
});
774+
}
775+
677776
iotune_multi_shard_context(::evaluation_directory dir, uint64_t random_write_io_buffer_size, uint64_t random_read_io_buffer_size)
678777
: _test_directory(dir)
679778
, _random_write_io_buffer_size(random_write_io_buffer_size)
@@ -689,6 +788,7 @@ struct disk_descriptor {
689788
uint64_t write_bw;
690789
std::optional<uint64_t> read_sat_len;
691790
std::optional<uint64_t> write_sat_len;
791+
std::optional<uint64_t> physical_block_size;
692792
};
693793

694794
void string_to_file(sstring conf_file, sstring buf) {
@@ -727,6 +827,9 @@ void write_property_file(sstring conf_file, std::vector<disk_descriptor> disk_de
727827
if (desc.write_sat_len) {
728828
out << YAML::Key << "write_saturation_length" << YAML::Value << *desc.write_sat_len;
729829
}
830+
if (desc.physical_block_size) {
831+
out << YAML::Key << "physical_block_size" << YAML::Value << *desc.physical_block_size;
832+
}
730833
out << YAML::EndMap;
731834
}
732835
out << YAML::EndSeq;
@@ -971,6 +1074,14 @@ int main(int ac, char** av) {
9711074

9721075
desc.read_iops = best_read_iops.iops;
9731076
desc.write_iops = best_write_iops.iops;
1077+
1078+
// Detect physical block size
1079+
fmt::print("Detecting physical block size: ");
1080+
std::cout.flush();
1081+
auto detected_pbs = iotune_tests.detect_physical_block_size(duration, test_directory.reported_physical_block_size()).get();
1082+
fmt::print("{} bytes\n", detected_pbs);
1083+
desc.physical_block_size = detected_pbs;
1084+
9741085
disk_descriptors.push_back(std::move(desc));
9751086
}
9761087

0 commit comments

Comments
 (0)