Skip to content

Commit 9be1101

Browse files
committed
Add LinuxPerf extension for branch + instruction counts
This updates the core BenchmarkTools types to include `instructions` and `branches` fields. These fields support serialization and all of the usual stats / judgements via the Trial / TrialEstimate / TrialRatio interface. If the extension is not available or `perf` is not configured correctly on your system, these are `NaN`. This also keeps the serialization format backwards-compatible, reporting any missing measurements as `NaN`.
1 parent b9f4c5e commit 9be1101

15 files changed

+628
-93
lines changed

.github/workflows/CI.yml

+1-7
Original file line numberDiff line numberDiff line change
@@ -16,20 +16,14 @@ jobs:
1616
fail-fast: false
1717
matrix:
1818
version:
19-
- '1.6'
19+
- '1.10'
2020
- '1'
2121
- 'nightly'
2222
arch:
2323
- x64
2424
os:
2525
- ubuntu-latest
2626
include:
27-
- version: '1.7'
28-
arch: x64
29-
os: ubuntu-20.04
30-
- version: '1.8'
31-
arch: x64
32-
os: ubuntu-22.04
3327
- version: '1.9'
3428
arch: x64
3529
os: ubuntu-22.04

Project.toml

+9-2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
1111
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1212
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
1313

14+
[weakdeps]
15+
LinuxPerf = "b4c46c6c-4fb0-484d-a11a-41bc3392d094"
16+
17+
[extensions]
18+
LinuxPerfExt = "LinuxPerf"
19+
1420
[compat]
1521
Aqua = "0.8"
1622
Compat = ">= 4.11.0"
@@ -22,7 +28,8 @@ Profile = "<0.0.1, 1"
2228
Statistics = "<0.0.1, 1"
2329
Test = "<0.0.1, 1"
2430
UUIDs = "<0.0.1, 1"
25-
julia = "1.6"
31+
julia = "1.9"
32+
LinuxPerf = ">= 0.4"
2633

2734
[extras]
2835
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
@@ -31,4 +38,4 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
3138
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
3239

3340
[targets]
34-
test = ["Aqua", "JuliaFormatter", "Statistics", "Test"]
41+
test = ["Aqua", "JuliaFormatter", "Statistics", "Test", "LinuxPerf"]

ext/LinuxPerfExt/LinuxPerfExt.jl

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
"""
    LinuxPerfExt

Package extension wiring `LinuxPerf.jl` hardware counters (instructions and
branches) into BenchmarkTools' `PerfInterface` hooks. Loaded automatically
when `LinuxPerf` is available; on systems where `perf` is unusable, the
no-op default interface is returned instead.
"""
module LinuxPerfExt

import BenchmarkTools: PerfInterface
import LinuxPerf: LinuxPerf, PerfBench, EventGroup, EventType
import LinuxPerf: enable!, disable!, enable_all!, disable_all!, close, read!

"""
    interface() -> PerfInterface

Probe whether `perf` events can be opened on this system and, if so, return a
`PerfInterface` whose hooks measure retired instructions and branches per
sample. Returns the default (no-op, `NaN`-reporting) `PerfInterface()` when
perf is unavailable or the event group cannot supply both counters.
"""
function interface()
    let g = try
            EventGroup([EventType(:hw, :instructions), EventType(:hw, :branches)])
        catch
            # If perf is not working on the system, the above constructor will throw an
            # ioctl or perf_event_open error (after presenting a warning to the user)
            return PerfInterface()
        end
        close(g)
        # Both events must have been opened for the measurements to be meaningful.
        length(g.fds) != 2 && return PerfInterface()
    end

    # If we made it here, perf seems to be working on this system
    return PerfInterface(;
        setup=() ->
            let g = EventGroup([EventType(:hw, :instructions), EventType(:hw, :branches)])
                PerfBench(0, EventGroup[g])
            end,
        start=(bench) -> enable_all!(),
        stop=(bench) -> disable_all!(),
        teardown=(bench) -> close(bench),
        read=(bench) -> let g = only(bench.groups)
            # Group read layout: nr events, time_enabled, time_running, then
            # one value per event (instructions, branches).
            (N, time_enabled, time_running, insts, branches) = read!(
                g.leader_io, Vector{UInt64}(undef, 5)
            )
            if 2 * time_running <= time_enabled
                # enabled less than 50% of the time
                # (most likely due to PMU contention with other perf events)
                return (NaN, NaN)
            else
                # account for partially-active measurement by scaling up
                k = time_enabled / time_running
                estimated_instructions = Float64(insts) * k
                estimated_branches = Float64(branches) * k
                return (estimated_instructions, estimated_branches)
            end
        end,
    )
end

end

src/BenchmarkTools.jl

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ export loadparams!
2525
include("trials.jl")
2626

2727
export gctime,
28+
instructions,
29+
branches,
2830
memory,
2931
allocs,
3032
params,

src/execution.jl

+48-9
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,24 @@ macro benchmarkable(args...)
506506
end
507507
end
508508

509+
"""
    PerfInterface(; setup, start, stop, read, teardown)

Hook bundle used by the benchmarking loop to collect hardware performance
counters. Each field is a function invoked at the corresponding phase of a
sample; the defaults are no-ops, with `read` reporting `(NaN, NaN)` for
(instructions, branches) when no counter backend (e.g. the LinuxPerf
extension) is active.
"""
struct PerfInterface
    setup::Function     # () -> backend state (passed to the other hooks)
    start::Function     # (bench) -> nothing; begin counting
    stop::Function      # (bench) -> nothing; stop counting
    read::Function      # (bench) -> (instructions, branches) as Float64s
    teardown::Function  # (bench) -> nothing; release backend resources

    function PerfInterface(;
        setup=Returns(nothing),
        start=Returns(nothing),
        stop=Returns(nothing),
        read=Returns((NaN, NaN)),
        teardown=Returns(nothing),
    )
        return new(setup, start, stop, read, teardown)
    end
end
526+
509527
# `eval` an expression that forcibly defines the specified benchmark at
510528
# top-level in order to allow transfer of locally-scoped variables into
511529
# benchmark scope.
@@ -553,6 +571,8 @@ function generate_benchmark_definition(
553571
end
554572
)
555573
end
574+
ext = Base.get_extension(BenchmarkTools, :LinuxPerfExt)
575+
LinuxPerf = isnothing(ext) ? PerfInterface() : ext.interface()
556576
return Core.eval(
557577
eval_module,
558578
quote
@@ -563,17 +583,34 @@ function generate_benchmark_definition(
563583
$(Expr(:tuple, quote_vars...)), __params::$BenchmarkTools.Parameters
564584
)
565585
$(setup)
586+
__perf_bench = $(LinuxPerf.setup)()
587+
__gcdiff = nothing
588+
__return_val = nothing
589+
__sample_time::Int64 = 0
590+
__sample_instructions::Float64 = 0
591+
__sample_branches::Float64 = 0
566592
__evals = __params.evals
567-
__gc_start = Base.gc_num()
568-
__start_time = time_ns()
569-
__return_val = $(invocation)
570-
for __iter in 2:__evals
571-
$(invocation)
593+
try
594+
__gc_start = Base.gc_num()
595+
$(LinuxPerf.start)(__perf_bench)
596+
__start_time = time_ns()
597+
__return_val = $(invocation)
598+
for __iter in 2:__evals
599+
$(invocation)
600+
end
601+
__sample_time = time_ns() - __start_time
602+
$(LinuxPerf.stop)(__perf_bench)
603+
__gcdiff = Base.GC_Diff(Base.gc_num(), __gc_start)
604+
__sample_instructions, __sample_branches = $(LinuxPerf.read)(
605+
__perf_bench
606+
)
607+
finally
608+
$(LinuxPerf.teardown)(__perf_bench)
609+
$(teardown)
572610
end
573-
__sample_time = time_ns() - __start_time
574-
__gcdiff = Base.GC_Diff(Base.gc_num(), __gc_start)
575-
$(teardown)
576611
__time = max((__sample_time / __evals) - __params.overhead, 0.001)
612+
__instructions = max(__sample_instructions / __evals, 0.0) # may be NaN
613+
__branches = max(__sample_branches / __evals, 0.0) # may be NaN
577614
__gctime = max((__gcdiff.total_time / __evals) - __params.overhead, 0.0)
578615
__memory = Int(Base.fld(__gcdiff.allocd, __evals))
579616
__allocs = Int(
@@ -585,7 +622,9 @@ function generate_benchmark_definition(
585622
__evals,
586623
),
587624
)
588-
return __time, __gctime, __memory, __allocs, __return_val
625+
return __time,
626+
__instructions, __branches, __gctime, __memory, __allocs,
627+
__return_val
589628
end
590629
$BenchmarkTools.Benchmark($(samplefunc), $(quote_vals), $(params))
591630
end,

src/groups.jl

+2
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ Base.min(groups::BenchmarkGroup...) = mapvals(min, groups...)
113113
Base.max(groups::BenchmarkGroup...) = mapvals(max, groups...)
114114

115115
Base.time(group::BenchmarkGroup) = mapvals(time, group)
116+
instructions(group::BenchmarkGroup) = mapvals(instructions, group)
117+
branches(group::BenchmarkGroup) = mapvals(branches, group)
116118
gctime(group::BenchmarkGroup) = mapvals(gctime, group)
117119
memory(group::BenchmarkGroup) = mapvals(memory, group)
118120
allocs(group::BenchmarkGroup) = mapvals(allocs, group)

src/parameters.jl

+27-2
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,14 @@ mutable struct Parameters
1414
gctrial::Bool
1515
gcsample::Bool
1616
time_tolerance::Float64
17+
instruction_tolerance::Float64
18+
branch_tolerance::Float64
1719
memory_tolerance::Float64
1820
end
1921

20-
const DEFAULT_PARAMETERS = Parameters(5.0, 10000, 1, false, 0, true, false, 0.05, 0.01)
22+
const DEFAULT_PARAMETERS = Parameters(
23+
5.0, 10000, 1, false, 0, true, false, 0.05, 0.05, 0.05, 0.01
24+
)
2125

2226
function Parameters(;
2327
seconds=DEFAULT_PARAMETERS.seconds,
@@ -28,6 +32,8 @@ function Parameters(;
2832
gctrial=DEFAULT_PARAMETERS.gctrial,
2933
gcsample=DEFAULT_PARAMETERS.gcsample,
3034
time_tolerance=DEFAULT_PARAMETERS.time_tolerance,
35+
instruction_tolerance=DEFAULT_PARAMETERS.instruction_tolerance,
36+
branch_tolerance=DEFAULT_PARAMETERS.branch_tolerance,
3137
memory_tolerance=DEFAULT_PARAMETERS.memory_tolerance,
3238
)
3339
return Parameters(
@@ -39,6 +45,8 @@ function Parameters(;
3945
gctrial,
4046
gcsample,
4147
time_tolerance,
48+
instruction_tolerance,
49+
branch_tolerance,
4250
memory_tolerance,
4351
)
4452
end
@@ -52,6 +60,8 @@ function Parameters(
5260
gctrial=nothing,
5361
gcsample=nothing,
5462
time_tolerance=nothing,
63+
instruction_tolerance=nothing,
64+
branch_tolerance=nothing,
5565
memory_tolerance=nothing,
5666
)
5767
params = Parameters()
@@ -63,6 +73,13 @@ function Parameters(
6373
params.gcsample = gcsample != nothing ? gcsample : default.gcsample
6474
params.time_tolerance =
6575
time_tolerance != nothing ? time_tolerance : default.time_tolerance
76+
params.instruction_tolerance = if instruction_tolerance != nothing
77+
instruction_tolerance
78+
else
79+
default.instruction_tolerance
80+
end
81+
params.branch_tolerance =
82+
branch_tolerance != nothing ? branch_tolerance : default.branch_tolerance
6683
params.memory_tolerance =
6784
memory_tolerance != nothing ? memory_tolerance : default.memory_tolerance
6885
return params::BenchmarkTools.Parameters
@@ -76,6 +93,8 @@ function Base.:(==)(a::Parameters, b::Parameters)
7693
a.gctrial == b.gctrial &&
7794
a.gcsample == b.gcsample &&
7895
a.time_tolerance == b.time_tolerance &&
96+
a.instruction_tolerance == b.instruction_tolerance &&
97+
a.branch_tolerance == b.branch_tolerance &&
7998
a.memory_tolerance == b.memory_tolerance
8099
end
81100

@@ -89,6 +108,8 @@ function Base.copy(p::Parameters)
89108
p.gctrial,
90109
p.gcsample,
91110
p.time_tolerance,
111+
p.instruction_tolerance,
112+
p.branch_tolerance,
92113
p.memory_tolerance,
93114
)
94115
end
@@ -109,7 +130,11 @@ end
109130

110131
@noinline function overhead_sample(evals)
111132
start_time = time_ns()
112-
for _ in 1:evals
133+
try
134+
for _ in 1:evals
135+
nullfunc()
136+
end
137+
finally
113138
nullfunc()
114139
end
115140
sample_time = time_ns() - start_time

src/serialization.jl

+32-2
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,38 @@ function recover(x::Vector)
5555
else
5656
xsi = if fn == "evals_set" && !haskey(fields, fn)
5757
false
58-
elseif fn in ("seconds", "overhead", "time_tolerance", "memory_tolerance") &&
59-
fields[fn] === nothing
58+
elseif fn in ("instructions", "branches")
59+
# JSON spec doesn't support NaN, so handle it specially here
60+
if !haskey(fields, fn)
61+
if ft === Vector{Float64}
62+
Float64[NaN for _ in length(fields["time"])]
63+
elseif ft === Float64
64+
NaN
65+
else
66+
@assert false
67+
end
68+
else
69+
if ft === Vector{Float64}
70+
Float64[
71+
elem === nothing ? NaN : convert(Float64, elem) for
72+
elem in fields[fn]
73+
]
74+
else
75+
fields[fn] === nothing ? NaN : convert(ft, fields[fn])
76+
end
77+
end
78+
elseif fn == "instruction_tolerance" && !haskey(fields, fn)
79+
DEFAULT_PARAMETERS.instruction_tolerance
80+
elseif fn == "branch_tolerance" && !haskey(fields, fn)
81+
DEFAULT_PARAMETERS.branch_tolerance
82+
elseif fn in (
83+
"seconds",
84+
"overhead",
85+
"time_tolerance",
86+
"instruction_tolerance",
87+
"branch_tolerance",
88+
"memory_tolerance",
89+
) && fields[fn] === nothing
6090
# JSON spec doesn't support Inf
6191
# These fields should all be >= 0, so we can ignore -Inf case
6292
typemax(ft)

0 commit comments

Comments (0)