2626#include < optional>
2727#include < random>
2828#include < memory>
29+ #include < map>
2930#include < ranges>
3031#include < vector>
3132#include < cmath>
@@ -106,6 +107,7 @@ struct evaluation_directory {
106107 uint64_t _available_space;
107108 uint64_t _min_data_transfer_size = 512 ;
108109 unsigned _disks_per_array = 0 ;
110+ std::optional<uint64_t > _reported_physical_block_size;
109111
110112 void scan_device (unsigned dev_maj, unsigned dev_min) {
111113 scan_device (fmt::format (" {}:{}" , dev_maj, dev_min));
@@ -140,6 +142,15 @@ struct evaluation_directory {
140142
141143 _min_data_transfer_size = std::max (_min_data_transfer_size, disk_min_io_size);
142144 _max_iodepth += read_first_line_as<uint64_t >(queue_dir / " nr_requests" );
145+
146+ // Read physical_block_size from sysfs for comparison with detected value
147+ if (!_reported_physical_block_size) {
148+ auto pbs_file = queue_dir / " physical_block_size" ;
149+ if (fs::exists (pbs_file)) {
150+ _reported_physical_block_size = read_first_line_as<uint64_t >(pbs_file);
151+ }
152+ }
153+
143154 _disks_per_array++;
144155 }
145156 } catch (std::system_error& se) {
@@ -178,6 +189,10 @@ struct evaluation_directory {
178189 return _min_data_transfer_size;
179190 }
180191
192+ std::optional<uint64_t > reported_physical_block_size () const {
193+ return _reported_physical_block_size;
194+ }
195+
181196 future<> discover_directory () {
182197 return seastar::async ([this ] {
183198 auto f = open_directory (_name).get ();
@@ -674,6 +689,90 @@ class iotune_multi_shard_context {
674689 return saturate (rate_threshold, buffer_size, duration, &test_file::read_workload);
675690 }
676691
692+ // Detect physical block size by testing write performance at different alignments.
693+ // Returns the smallest alignment where write performance is good (no RMW penalty).
694+ // Tests alignments from 512 bytes up to 8192 bytes.
695+ //
696+ // The detection works by measuring write IOPS at each alignment. If writes smaller
697+ // than the physical block size are issued, the disk must perform read-modify-write
698+ // (RMW) operations, which reduces IOPS. Once we reach the physical block size,
699+ // IOPS should plateau as larger alignments don't provide additional benefit.
700+ future<uint64_t > detect_physical_block_size (std::chrono::duration<double > duration, std::optional<uint64_t > reported_pbs = std::nullopt ) {
701+ return seastar::async ([this , duration, reported_pbs] {
702+ // Test alignments: 512, 1024, 2048, 4096, 8192
703+ // This covers most common physical block sizes
704+ std::vector<uint64_t > test_alignments = {512 , 1024 , 2048 , 4096 , 8192 };
705+ std::map<uint64_t , float > alignment_to_iops;
706+
707+ iotune_logger.info (" Detecting physical block size by testing write performance at different alignments" );
708+
709+ for (auto alignment : test_alignments) {
710+ // Perform a short write test at this alignment (5% of total test duration)
711+ auto rates = write_random_data (alignment, duration * 0.05 ).get ();
712+ alignment_to_iops[alignment] = rates.iops ;
713+
714+ iotune_logger.info (" Alignment {} bytes: {} IOPS" , alignment, uint64_t (rates.iops ));
715+
716+ // Clear rates after each test
717+ sharded_rates.invoke_on_all ([] (std::vector<unsigned >& rates) {
718+ rates.clear ();
719+ return make_ready_future<>();
720+ }).get ();
721+ }
722+
723+ // Find the smallest alignment where performance plateaus.
724+ // We look for the point where increasing alignment doesn't significantly improve IOPS.
725+ // This indicates we've reached or exceeded the physical block size.
726+ //
727+ // Algorithm: Find the first alignment where the next larger alignment doesn't
728+ // improve IOPS by more than 5%. This is the physical block size.
729+ uint64_t detected_size = test_alignments.back (); // Default to largest if no plateau found
730+ float max_iops = 0 ;
731+
732+ for (size_t i = 0 ; i < test_alignments.size () - 1 ; ++i) {
733+ auto current_alignment = test_alignments[i];
734+ auto next_alignment = test_alignments[i + 1 ];
735+ auto current_iops = alignment_to_iops[current_alignment];
736+ auto next_iops = alignment_to_iops[next_alignment];
737+
738+ max_iops = std::max (max_iops, current_iops);
739+
740+ // If the next larger alignment doesn't improve IOPS by more than 5%,
741+ // we've found the physical block size
742+ if (next_iops <= current_iops * 1.05 ) {
743+ detected_size = current_alignment;
744+ iotune_logger.info (" Performance plateau detected at {} bytes" , detected_size);
745+ break ;
746+ }
747+ }
748+
749+ // If we didn't find a plateau, use the alignment with the best IOPS
750+ if (detected_size == test_alignments.back ()) {
751+ for (const auto & [alignment, iops] : alignment_to_iops) {
752+ if (iops >= max_iops * 0.95 ) { // Within 5% of max
753+ detected_size = std::min (detected_size, alignment);
754+ }
755+ }
756+ iotune_logger.info (" No clear plateau, using alignment with best performance: {} bytes" , detected_size);
757+ }
758+
759+ iotune_logger.info (" Detected physical block size: {} bytes" , detected_size);
760+
761+ // Compare with driver-reported value if available
762+ if (reported_pbs) {
763+ if (detected_size == *reported_pbs) {
764+ iotune_logger.info (" Detected value matches driver-reported value ({} bytes)" , *reported_pbs);
765+ } else {
766+ iotune_logger.warn (" Detected value differs from driver-reported value ({} bytes)" , *reported_pbs);
767+ iotune_logger.warn (" The disk may be lying about its physical block size" );
768+ iotune_logger.warn (" Using empirically detected value {} bytes to avoid write amplification" , detected_size);
769+ }
770+ }
771+
772+ return detected_size;
773+ });
774+ }
775+
677776 iotune_multi_shard_context (::evaluation_directory dir, uint64_t random_write_io_buffer_size, uint64_t random_read_io_buffer_size)
678777 : _test_directory(dir)
679778 , _random_write_io_buffer_size(random_write_io_buffer_size)
@@ -689,6 +788,7 @@ struct disk_descriptor {
689788 uint64_t write_bw;
690789 std::optional<uint64_t > read_sat_len;
691790 std::optional<uint64_t > write_sat_len;
791+ std::optional<uint64_t > physical_block_size;
692792};
693793
694794void string_to_file (sstring conf_file, sstring buf) {
@@ -727,6 +827,9 @@ void write_property_file(sstring conf_file, std::vector<disk_descriptor> disk_de
727827 if (desc.write_sat_len ) {
728828 out << YAML::Key << " write_saturation_length" << YAML::Value << *desc.write_sat_len ;
729829 }
830+ if (desc.physical_block_size ) {
831+ out << YAML::Key << " physical_block_size" << YAML::Value << *desc.physical_block_size ;
832+ }
730833 out << YAML::EndMap;
731834 }
732835 out << YAML::EndSeq;
@@ -971,6 +1074,14 @@ int main(int ac, char** av) {
9711074
9721075 desc.read_iops = best_read_iops.iops ;
9731076 desc.write_iops = best_write_iops.iops ;
1077+
1078+ // Detect physical block size
1079+ fmt::print (" Detecting physical block size: " );
1080+ std::cout.flush ();
1081+ auto detected_pbs = iotune_tests.detect_physical_block_size (duration, test_directory.reported_physical_block_size ()).get ();
1082+ fmt::print (" {} bytes\n " , detected_pbs);
1083+ desc.physical_block_size = detected_pbs;
1084+
9741085 disk_descriptors.push_back (std::move (desc));
9751086 }
9761087
0 commit comments