 end
 
 def implementations(ruby_obj)
+  state = JSON::State.new(JSON.dump_default_options)
+
   {
     json: ["json", proc { JSON.dump(ruby_obj) }],
+    json_state: ["json (reuse)", proc { state.generate(ruby_obj) }],
     oj: ["oj", proc { Oj.dump(ruby_obj) }],
     rapidjson: ["rapidjson", proc { RapidJSON.dump(ruby_obj) }],
   }
 end
 
-def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true)
+def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [])
   json_output = JSON.dump(ruby_obj)
   puts "== Encoding #{benchmark_name} (#{json_output.bytesize} bytes)"
 
+  impls = implementations(ruby_obj).select { |name| RUN[name] }
+  except.each { |i| impls.delete(i) }
+
   Benchmark.ips do |x|
     expected = ::JSON.dump(ruby_obj) if check_expected
-    implementations(ruby_obj).select { |name| RUN[name] }.values.each do |name, block|
+    impls.values.each do |name, block|
       begin
         result = block.call
         if check_expected && expected != result
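For context on the `json_state` entry added above: it reuses one `JSON::State` (a `Generator::State`) across calls instead of letting `JSON.dump` build a fresh one each time. Below is a minimal standalone sketch of that reuse pattern, using only the public `json` API already exercised by the patch; the `payload` variable is purely illustrative.

```ruby
require "json"

payload = [[1, 2, 3, 4, 5]] * 10

# JSON.dump builds a fresh generator state internally on every call,
# so each dump pays one extra allocation compared to the reuse below.
via_dump = JSON.dump(payload)

# Reusing a single JSON::State amortizes that allocation across calls.
state = JSON::State.new(JSON.dump_default_options)
via_state = state.generate(payload)

via_state == via_dump # => true, both produce the same JSON string
```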
@@ -45,9 +51,26 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true)
   puts
 end
 
+# On the first two micro benchmarks, the limiting factor is that we have to create a Generator::State object for every
+# call to `JSON.dump`, so we cause 2 allocations per call where alternatives only do one allocation.
+# The performance difference is mostly more time spent in GC because of this extra pressure.
+# If we re-use the same `JSON::State` instance, we're faster than Oj on the array benchmark, and much closer
+# on the Hash one.
 benchmark_encoding "small nested array", [[1, 2, 3, 4, 5]]*10
 benchmark_encoding "small hash", { "username" => "jhawthorn", "id" => 123, "event" => "wrote json serializer" }
-benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json")
-benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json")
-benchmark_encoding "canada.json", JSON.load_file("#{__dir__}/data/canada.json"), check_expected: false
-benchmark_encoding "many #to_json calls", [{ Object.new => Object.new, 12 => 54.3, Integer => Float, Time.now => Date.today }] * 20
+
+# On these two benchmarks we perform well.
+benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json"), except: %i(json_state)
+benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json"), except: %i(json_state)
+
+# This benchmark spends the overwhelming majority of its time in `ruby_dtoa`. We rely on Ruby's implementation,
+# which uses a relatively old version of dtoa.c from David M. Gay.
+# Oj is noticeably faster here because it limits the precision of floats, which breaks round-tripping. That's not
+# something we should emulate.
+# In recent years, much faster float-to-string implementations have appeared, such as Ryu and Dragonbox,
+# but they are all implemented in C++11 or newer, making it hard if not impossible to include them.
+# Short of a pure C99 implementation of these newer algorithms, there isn't much that can be done to match
+# Oj's speed without losing precision.
+benchmark_encoding "canada.json", JSON.load_file("#{__dir__}/data/canada.json"), check_expected: false, except: %i(json_state)
+
+benchmark_encoding "many #to_json calls", [{ Object.new => Object.new, 12 => 54.3, Integer => Float, Time.now => Date.today }] * 20, except: %i(json_state)