diff --git a/Project.toml b/Project.toml
index a91c231d..b48447a1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,14 +4,17 @@ version = "1.3.2"
 
 [deps]
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+LinuxPerf = "b4c46c6c-4fb0-484d-a11a-41bc3392d094"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
 [compat]
 JSON = "0.18, 0.19, 0.20, 0.21"
+LinuxPerf = "= 0.3.5"
 julia = "1"
 
 [extras]
diff --git a/src/BenchmarkTools.jl b/src/BenchmarkTools.jl
index c5d23077..445ad3a5 100644
--- a/src/BenchmarkTools.jl
+++ b/src/BenchmarkTools.jl
@@ -9,6 +9,8 @@
 using UUIDs: uuid4
 using Printf
 using Profile
+import LinuxPerf
+import Random
 
 const BENCHMARKTOOLS_VERSION = v"1.0.0"
 
diff --git a/src/execution.jl b/src/execution.jl
index 16bb6532..e8325b66 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -96,9 +96,9 @@ function _run(b::Benchmark, p::Parameters; verbose = false, pad = "", kwargs...)
     start_time = Base.time()
     trial = Trial(params)
     params.gcsample && gcscrub()
-    s = b.samplefunc(b.quote_vals, params)
-    push!(trial, s[1:end-1]...)
-    return_val = s[end]
+    trial_contents = b.samplefunc(b.quote_vals, params)
+    push!(trial, trial_contents)
+    return_val = trial_contents.__return_val
     iters = 2
     while (Base.time() - start_time) < params.seconds && iters ≤ params.samples
         params.gcsample && gcscrub()
@@ -492,6 +492,9 @@ function generate_benchmark_definition(eval_module, out_vars, setup_vars, quote_
             x
         end)
     end
+    experimental_enable_linux_perf = true # TODO: take this as input from the user
+    # TODO: let the user actually provide these options.
+    linux_perf_opts = LinuxPerf.parse_pstats_options([])
     return Core.eval(eval_module, quote
         @noinline $(signature_def) = begin $(core_body) end
         @noinline function $(samplefunc)($(Expr(:tuple, quote_vars...)), __params::$BenchmarkTools.Parameters)
@@ -512,7 +515,39 @@ function generate_benchmark_definition(eval_module, out_vars, setup_vars, quote_
                 __allocs = Int(Base.fld(__gcdiff.malloc + __gcdiff.realloc +
                                         __gcdiff.poolalloc + __gcdiff.bigalloc,
                                         __evals))
-                return __time, __gctime, __memory, __allocs, __return_val
+                if $(experimental_enable_linux_perf)
+                    # Based on https://github.com/JuliaPerf/LinuxPerf.jl/blob/a7fee0ff261a5b5ce7a903af7b38d1b5c27dd931/src/LinuxPerf.jl#L1043-L1061
+                    __linux_perf_groups = LinuxPerf.set_default_spaces(
+                        $(linux_perf_opts.events),
+                        $(linux_perf_opts.spaces),
+                    )
+                    __linux_perf_bench = LinuxPerf.make_bench_threaded(
+                        __linux_perf_groups;
+                        threads = $(linux_perf_opts.threads),
+                    )
+                    LinuxPerf.enable!(__linux_perf_bench)
+                    # We'll just run it one time.
+                    __return_val_2 = $(invocation)
+                    LinuxPerf.disable!(__linux_perf_bench)
+                    # trick the compiler not to eliminate the code
+                    if rand() < 0
+                        __linux_perf_stats = __return_val_2
+                    else
+                        __linux_perf_stats = LinuxPerf.Stats(__linux_perf_bench)
+                    end
+                else
+                    # Keep the NamedTuple shape identical in both branches so
+                    # `push!(::Trial, ::TrialContents)` dispatches either way.
+                    __linux_perf_stats = nothing
+                end
+                return (;
+                    __time,
+                    __gctime,
+                    __memory,
+                    __allocs,
+                    __return_val,
+                    __linux_perf_stats,
+                )
             end
             $BenchmarkTools.Benchmark($(samplefunc), $(quote_vals), $(params))
         end)
diff --git a/src/trials.jl b/src/trials.jl
index e24d9271..38fa7296 100644
--- a/src/trials.jl
+++ b/src/trials.jl
@@ -8,6 +8,7 @@ mutable struct Trial
     gctimes::Vector{Float64}
     memory::Int
     allocs::Int
+    linux_perf_stats::Union{LinuxPerf.Stats, Nothing}
 end
 
-Trial(params::Parameters) = Trial(params, Float64[], Float64[], typemax(Int), typemax(Int))
+Trial(params::Parameters) = Trial(params, Float64[], Float64[], typemax(Int), typemax(Int), nothing)
@@ -22,11 +23,26 @@
 end
 
-Base.copy(t::Trial) = Trial(copy(t.params), copy(t.times), copy(t.gctimes), t.memory, t.allocs)
+Base.copy(t::Trial) =
+    Trial(copy(t.params), copy(t.times), copy(t.gctimes), t.memory, t.allocs, t.linux_perf_stats)
 
-function Base.push!(t::Trial, time, gctime, memory, allocs)
+# Exact name set (and order) of the NamedTuple produced by the generated
+# `samplefunc` in src/execution.jl; `push!` dispatches on it.
+const TrialContents = NamedTuple{(
+    :__time,
+    :__gctime,
+    :__memory,
+    :__allocs,
+    :__return_val,
+    :__linux_perf_stats,
+)}
+
+function Base.push!(t::Trial, trial_contents::TrialContents)
+    time = trial_contents.__time
+    gctime = trial_contents.__gctime
+    memory = trial_contents.__memory
+    allocs = trial_contents.__allocs
     push!(t.times, time)
     push!(t.gctimes, gctime)
     memory < t.memory && (t.memory = memory)
     allocs < t.allocs && (t.allocs = allocs)
+    t.linux_perf_stats = trial_contents.__linux_perf_stats
     return t
 end