fix: bump rotary and adjust top level images

drbh · drbh · commit 96dfbd97a2bf · 2025-10-29T15:43:59.000Z
diff --git a/benches/index.md b/benches/index.md
@@ -4,8 +4,7 @@
 ## [Layer Norm](layer_norm/)
 
 <div class="artifact-preview">
-<object data="layer_norm/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
-</object>
+<img src="layer_norm/results/artifacts/combine/latency.svg" alt="Layer Norm Latency" width="800">
 </div>
 
 | Implementation | Description |
@@ -16,8 +15,7 @@
 ## [Rotary Position Embeddings](rotary/)
 
 <div class="artifact-preview">
-<object data="rotary/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
-</object>
+<img src="rotary/results/artifacts/combine/latency.svg" alt="Rotary Position Embeddings Latency" width="800">
 </div>
 
 | Implementation | Description |
@@ -28,8 +26,7 @@
 ## [Flash Attention](flash_attn/)
 
 <div class="artifact-preview">
-<object data="flash_attn/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
-</object>
+<img src="flash_attn/results/artifacts/combine/latency.svg" alt="Flash Attention Latency" width="800">
 </div>
 
 | Implementation | Description |
@@ -44,8 +41,7 @@
 ## [Causal Conv1D](causal_conv1d/)
 
 <div class="artifact-preview">
-<object data="causal_conv1d/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
-</object>
+<img src="causal_conv1d/results/artifacts/combine/latency.svg" alt="Causal Conv1D Latency" width="800">
 </div>
 
 | Implementation | Description |
@@ -56,8 +52,7 @@
 ## [Activation](activation/)
 
 <div class="artifact-preview">
-<object data="activation/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
-</object>
+<img src="activation/results/artifacts/combine/latency.svg" alt="Activation Latency" width="800">
 </div>
 
 | Implementation | Description |
@@ -68,8 +63,7 @@
 ## [ReLU](relu/)
 
 <div class="artifact-preview">
-<object data="relu/results/artifacts/combine/latency.svg" type="image/svg+xml" width="800">
-</object>
+<img src="relu/results/artifacts/combine/latency.svg" alt="ReLU Latency" width="800">
 </div>
 
 | Implementation | Description |
diff --git a/benches/rotary/impls/hf_kernels_rotary.md b/benches/rotary/impls/hf_kernels_rotary.md
@@ -60,5 +60,6 @@ run_benchmark(
     impl_name="hf_kernels_rotary",
     impl_tags={"family": "hf-kernels", "backend": "cuda"},
     impl_func=hf_kernels_rotary,
+    dtype="float32",
 )
 ```
diff --git a/tools/kernels_benchmark_tools/__init__.py b/tools/kernels_benchmark_tools/__init__.py
@@ -50,11 +50,15 @@ def run_benchmark(
     impl_name: str | None = None,
     impl_tags: dict | None = None,
     impl_func=None,
+    reps: int = 5,
+    warmup: int = 2,
+    dtype: str | None = None,
+    device: str | None = None,
     **kwargs,
 ):
     # Determine device and dtype (TODO: allow user override)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    dtype = "float32" if device == "cpu" else "bfloat16"
+    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+    dtype = dtype or ("float32" if device == "cpu" else "bfloat16")
 
     # Get the kernel module based on type (TODO: handle invalid type)
     kernel_module = KERNEL_MODULES[kernel_type]
@@ -77,8 +81,8 @@ def run_benchmark(
     run(
         wl,
         jsonl=f"{kernel_type.value}.jsonl",
-        reps=5,
-        warmup=2,
+        reps=reps,
+        warmup=warmup,
         gen=kernel_module.gen_inputs,
         ref=kernel_module.ref_impl,
         cmp=kernel_module.cmp_allclose,
diff --git a/tools/kernels_benchmark_tools/core/harness.py b/tools/kernels_benchmark_tools/core/harness.py
@@ -92,6 +92,10 @@ def run(
     env = _env_block()
     now = lambda: time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
 
+    # clear old results: delete the `jsonl` file if it already exists
+    if os.path.exists(jsonl):
+        os.remove(jsonl)
+
     for wl in workloads:
         inputs = gen(wl)
         ref_out = ref(inputs)
diff --git a/tools/kernels_benchmark_tools/rotary.py b/tools/kernels_benchmark_tools/rotary.py
@@ -130,3 +130,19 @@ def workloads(dtype="float32", device="cuda") -> Iterable[dict]:
                         "device": device,
                         "seed": 0,
                     }
+
+
+# Single small workload for quick testing; same signature as workloads().
+def _workloads(dtype="float32", device="cuda") -> Iterable[dict]:
+    print("✅ Using single workload for quick testing.")
+    yield {
+        "name": f"{device}_B1_S128_H8_D64_R32",
+        "batch": 1,
+        "seqlen": 128,
+        "num_heads": 8,
+        "head_dim": 64,
+        "rotary_dim": 32,
+        "dtype": dtype,
+        "device": device,
+        "seed": 0,
+    }

Original file line number	Diff line number	Diff line change
`@@ -60,5 +60,6 @@ run_benchmark(`
`60`	`60`	`impl_name="hf_kernels_rotary",`
`61`	`61`	`impl_tags={"family": "hf-kernels", "backend": "cuda"},`
`62`	`62`	`impl_func=hf_kernels_rotary,`
	`63`	`+ dtype="float32",`
`63`	`64`	`)`
`64`	`65`	```